Approaches that dynamically adapt the processing resources to application needs under multiple optimization goals and constraints can be characterized into the application-specific and feedback-based techniques. Whereas application-specific approaches typically statically use an offline stage to determine the best configuration for each known workload, feedback-based approaches, using, e.g., control theory, adapt the system without the need of knowing the effect of workload on these goals.
In this paper, we evaluate a state-of-the-art approach of each of the two categories and compare them for image processing applications in terms of energy consumption and number of deadline misses on a given many-core architecture. In addition, we propose a second feedback-based approach that is based on finite state machines (FSMs). The obtained results suggest that whereas the state-of-the-art application-specific approach is able to meet a specified latency deadline whenever possible while consuming the least amount of energy, it requires a perfect characterization of the workload on a given many-core system. If such knowledge is not available, the feedback-based approaches have their strengths in achieving comparable energy savings, but missing deadlines more ofte}, author = {Esper, Khalil and Wildermann, Stefan and Teich, Jürgen}, booktitle = {Proceedings of the Workshop on Next Generation Real-Time Embedded Systems (NG-RES), OASICS Vol. 87}, date = {2021-01-20/2021-01-20}, doi = {10.4230/OASIcs.NG-RES.2021.1}, faupublication = {yes}, isbn = {978-3-95977-178-8}, keywords = {Soft real-time; Energy optimization; Control-theory; Timing analysis; Dynamic voltage and frequency scaling; Finite state machines; Multi-core; Many-core;}, month = {Jan}, pages = {1:1--1:12}, peerreviewed = {Yes}, title = {{A} {Comparative} {Evaluation} of {Latency}-{Aware} {Energy} {Optimization} {Approaches} in {Many}-{Core} {Systems}}, url = {https://drops.dagstuhl.de/opus/volltexte/2021/13477}, venue = {Budapest}, year = {2021} } @inproceedings{faucris.118189984, abstract = {Using Field Programmable Gate Arrays (FPGAs) as accelerators for image or video processing operations and algorithms has gained increasing attention over the last few years. One reason for that is FPGAs are able to exploit both temporal and spatial parallelism. In this paper two platforms for FPGA-based real-time image and video processing are presented and compared against each other. With both of these platforms it is possible to update the physical resources during run-time by exploiting the dynamic partial reconfiguration capabilities of Xilinx Virtex FPGAs. The analysis of both platforms with respect to their benefits and drawbacks has led to the concept of an optimal FPGA-based dynamically and partially reconfigurable platform for real-time video and image processing. ©2008 IEEE.}, address = {New York}, author = {Angermeier, Josef and Claus, Christopher and Stechele, Walter and Teich, Jürgen}, booktitle = {Proceedings of International Conference on Field-Programmable Logic and Applications}, date = {2008-09-08/2008-09-10}, doi = {10.1109/FPL.2008.4630015}, faupublication = {yes}, note = {UnivIS-Import:2015-04-16:Pub.2008.tech.IMMD.inform.acompa}, pages = {587-590}, publisher = {IEEE Press}, title = {{A} comparison of embedded reconfigurable video-processing architectures}, url = {http://www.kip.uni-heidelberg.de/fpl08/titel/index.php}, venue = {Heidelberg}, year = {2008} } @phdthesis{faucris.119577744, author = {Teich, Jürgen}, faupublication = {no}, note = {UnivIS-Import:2015-04-02:Pub.1993.tech.IMMD.inform.acompi{\_}0}, school = {Friedrich-Alexander-Universität Erlangen-Nürnberg}, title = {{A} {Compiler} for {Application}-{Specific} {Processor} {Arrays} ({Zugl}. {Doktorarbeit})}, year = {1993} } @inproceedings{faucris.203946833, abstract = {New relaxed quality standards laid down by approximate computing enrich the design pool with architectures dissipating less power, consuming fewer resources or with smaller latencies. In LUTbased FPGA logic approximation, the number of LUTs and latency associated to a design can be optimized by allowing the approximation of circuit results. In this paper, we present techniques for automatic design space exploration (DSE) of Boolean function falsifications and the ability and impact to reduce resources usage as well as the length of critical paths on LUT-based FPGAs. Our experiments give evidence that resource reductions of about 20% are easily achievable for error rates amounting to less than 0.05% w.r.t. accurate design}, author = {Echavarria Gutiérrez, Jorge Alfonso and Wildermann, Stefan and Teich, Jürgen}, booktitle = {Proceedings of 2018 International Conference on Field Programmable Technology}, date = {2018-12-10/2018-12-14}, doi = {10.1109/fpt.2018.00065}, faupublication = {yes}, keywords = {Approximate Computing; Logic Simplification; Design Space Exploration; FPGAs.}, peerreviewed = {Yes}, title = {{AConFPGA}: {A} {Multiple}-{Output} {Boolean} {Function} {Approximation} {DSE} {Technique} {Targeting} {FPGAs}}, venue = {Naha, Okinawa}, year = {2018} } @inproceedings{faucris.115219104, author = {Mühleis, Nina and Glaß, Michael and Zhang, Liyuan and Teich, Jürgen}, booktitle = {ACM SIGBED Review - Work-in-Progress (WiP) Session of the 2nd International Conference on Cyber Physical Systems (ICCPS 2011)}, doi = {10.1145/2000367.2000372}, faupublication = {yes}, note = {UnivIS-Import:2015-04-14:Pub.2011.tech.IMMD.inform.acosim}, pages = {23-26}, peerreviewed = {unknown}, title = {{A} {Co}-{Simulation} {Approach} for {Control} {Performance} {Analysis} during {Design} {Space} {Exploration} of {Cyber}-{Physical} {Systems}}, volume = {8(2)}, year = {2011} } @inproceedings{faucris.122569524, abstract = {Control applications have become an integral part of modern networked embedded systems. However, there often exists a gap between control engineering and system design. The control engineer has detailed knowledge about the algorithms but is abstracting from the system architecture and implementation. On the other hand, the system designer aims at achieving high-quality implementations based on quality constraints specified by the control engineer. This may result in either an overdesigned system in case the specifications are pessimistic or an unsafe system behavior when specifications are too optimistic. Thus, future design automation approaches have to consider the quality of control applications both as design objectives and design constraints to achieve safe yet highly optimized system implementations. The work at hand introduces an automatic tool flow at the Electronic System Level (ESL) that enables the optimization of a system implementation with quality of control being introduced as a principal design objective, like the maximum braking distance, while respecting constraints like maximum slip to ensure maneuverability of a car. The gap between mathematically well-defined models for system synthesis and common analysis techniques for control quality is bridged by co-simulation: A SystemC-based virtual prototype of a distributed controller implementation is combined with high-level models of the plants specified in Matlab/Simulink. Through a model transformation, the traditional development process of control applications is combined with state-of-the-art ESL techniques, ensuring model consistency while enabling a high degree of automation. © 2012 IEEE.}, address = {New York, NY, USA}, author = {Glaß, Michael and Teich, Jürgen and Zhang, Liyuan}, booktitle = {Proc. of the 2012 International Conference on Embedded Computer Systems: Architectures, Modeling, and Simulation (SAMOS)}, date = {2012-07-16/2012-07-19}, doi = {10.1109/SAMOS.2012.6404200}, faupublication = {yes}, isbn = {978-1-4673-2295-9}, note = {UnivIS-Import:2015-04-16:Pub.2012.tech.IMMD.inform.acosim}, pages = {355-362}, publisher = {IEEE Press}, title = {{A} {Co}-simulation {Approach} for {System}-{Level} {Analysis} of {Embedded} {Control} {Systems}}, venue = {Samos}, year = {2012} } @inproceedings{faucris.118569484, address = {Hamburg, Germany}, author = {Zhang, Liyuan and Glaß, Michael and Streubühr, Martin and Teich, Jürgen and von Schwerin, Andreas and Liu, Kai}, booktitle = {Tagungsunterlagen Methoden und Beschreibungssprachen zur Modellierung und Verifikation von Schaltungen und Systemen (MBMV)}, date = {2012-03-05/2012-03-07}, faupublication = {yes}, isbn = {978-1-4673-2295-9}, note = {UnivIS-Import:2015-04-16:Pub.2012.tech.IMMD.inform.actoro}, pages = {193-204}, publisher = {Verlag Dr. Kovac}, title = {{Actor}-oriented {Modeling} and {Simulation} of {Cut}-through {Communication} in {Network} {Controllers}}, venue = {Kaiserslautern}, year = {2012} } @inproceedings{faucris.117071504, abstract = {Embedded real-time image processing systems have to process huge amounts of data with limited resources and energy. Hence high efficiency is not only required for manual, but also for automatic system generation. Therefore, in order to allow for different optimizations, a system specification must be such that important algorithm properties are accessible to the system design software. In this paper, we present a new method how multi-dimensional image processing algorithms can be modeled by actor-oriented data flow semantics. Using the example of a binary morphological reconstruction, we investigate the modeling requirements posed by point, local and global image processing algorithms. We show how they can be taken into account in our approach, so that efficient implementation and analysis in terms of buffer size and throughput is possible. In particular, by the explicit specification of the communication behavior, both static and data dependent algorithms are supported allowing for a complete system specification. © 2007 IEEE.}, author = {Keinert, Joachim and Falk, Joachim and Haubelt, Christian and Teich, Jürgen}, booktitle = {Proceedings of the 2007 IEEE/ACM/IFIP Workshop of Embedded Systems for Real-Time Multimedia (ESTIMEDIA 2007)}, date = {2007-10-04/2007-10-05}, doi = {10.1109/ESTMED.2007.4375815}, faupublication = {yes}, isbn = {9781424416547}, pages = {113-118}, peerreviewed = {unknown}, title = {{Actor}-oriented modeling and simulation of sliding window image processing algorithms}, venue = {Salzburg}, year = {2007} } @inproceedings{faucris.124193784, author = {Kiesel, Rainer and Löhlein, Otto and Terzis, Anestis and Streubühr, Martin and Haubelt, Christian and Teich, Jürgen}, booktitle = {Methoden und Beschreibungssprachen zur Modellierung und Verifikation von Schaltungen und Systemen}, faupublication = {yes}, pages = {117-126}, peerreviewed = {unknown}, title = {{Actor}-oriented {Modeling} of {Driver} {Assistance} {Systems} for {Efficient} {Multi}-{Core} {ECU} {Implementation}}, venue = {Dresden}, year = {2010} } @inproceedings{faucris.123122604, author = {Kutzer, Philip and Streubühr, Martin and Haubelt, Christian and Teich, Jürgen and von Schwerin, Andreas}, booktitle = {Proceedings of the Embedded World Conference}, date = {2011-03-01/2011-03-03}, faupublication = {yes}, pages = {1-10}, peerreviewed = {unknown}, title = {{Actor}-oriented {Modeling} of {Industrial} {Ethernet} in the {Automation} {Domain} {Using} {SystemC}}, venue = {Nuermberg}, year = {2011} } @inproceedings{faucris.122834624, author = {Roloff, Sascha and Pöppl, Alexander and Schwarzer, Tobias and Wildermann, Stefan and Baader, Michael and Glaß, Michael and Hannig, Frank and Teich, Jürgen}, booktitle = {Proceedings of the 6th ACM SIGPLAN X10 Workshop (X10)}, faupublication = {yes}, pages = {24-29}, peerreviewed = {unknown}, title = {{ActorX10}: {An} {Actor} {Library} for {X10}}, venue = {Santa Barbara, CA}, year = {2016} } @incollection{faucris.228838653, author = {Roloff, Sascha and Hannig, Frank and Teich, Jürgen}, booktitle = {Modeling and Simulation of Invasive Applications and Architectures}, doi = {10.1007/978-981-13-8387-8{\_}6}, editor = {Sascha Roloff, Frank Hannig, Jürgen Teich}, faupublication = {yes}, month = {Jan}, note = {CRIS-Team WoS Importer:2019-11-08}, pages = {129-164}, peerreviewed = {unknown}, series = {Computer Architecture and Design Methodologies}, title = {{ActorX10} and {Run}-{Time} {Application} {Embedding}}, year = {2019} } @inproceedings{faucris.123471964, author = {Witterauf, Michael and Tanase, Alexandru-Petru and Hannig, Frank and Teich, Jürgen}, booktitle = {Proceedings of the 11th International Summer School on Advanced Computer Architecture and Compilation for High-Performance and Embedded Systems (ACACES)}, date = {2015-07-12/2015-07-18}, faupublication = {yes}, isbn = {978-88-905806-3-5}, pages = {205-208}, peerreviewed = {Yes}, publisher = {HiPEAC}, title = {{Adaptive} {Fault} {Tolerance} in {Tightly} {Coupled} {Processor} {Arrays} with {Invasive} {Computing}}, venue = {Fiuggi}, year = {2015} } @inproceedings{faucris.122626284, abstract = {Fault tolerance is a basic necessity to make today's complex systems reliable. Adequate fault tolerance, however, demands a high degree of redundancy, possibly wasting resources when the fault probability is low or when some applications do not require fault tolerance. Under the term adaptive fault tolerance, we investigate means to instead provide on-demand fault tolerance on multi-core systems dynamically and according to application and environmental needs. Such means are provided on a per-application basis by invasive computing, a recent paradigm for resource-aware programming and design of parallel systems: applications request resources in an invade phase, infect the acquired resources with code and data, and finally release them in a retreat phase. We show how to use these simple but powerful constructs to adaptively tolerate faults and that invasive computing harmonizes well with many existing fault tolerance approaches. Finally, a case study on adaptively providing fault tolerance for loops demonstrates how effective invasive computing is for adapting to a varying soft error rate and handling of faults.}, author = {Witterauf, Michael and Tanase, Alexandru-Petru and Teich, Jürgen and Lari, Vahid and Zwinkau, Andreas and Snelting, Gregor}, booktitle = {Proceedings of the 2015 NASA/ESA Conference on Adaptive Hardware and Systems}, date = {2016-06-15/2015-06-18}, doi = {10.1109/AHS.2015.7231155}, faupublication = {yes}, isbn = {9781467375016}, keywords = {Adaptation models; Fault tolerant systems; Hardware; Redundancy; Runtime}, pages = {1-8}, peerreviewed = {unknown}, publisher = {Institute of Electrical and Electronics Engineers Inc.}, title = {{Adaptive} fault tolerance through invasive computing}, venue = {Montreal}, year = {2015} } @book{faucris.110176044, editor = {Mitra, Tulika and Teich, Jürgen and Thiele, Lothar}, faupublication = {yes}, peerreviewed = {automatic}, series = {Dagstuhl Reports}, title = {{Adaptive} {Isolation} for {Predictability} and {Security}}, volume = {6}, year = {2016} } @inproceedings{faucris.119495024, author = {Teich, Jürgen}, booktitle = {Proc. of the 18th Int. Workshop on Software and Compilers for Embedded Systems (SCOPES 2015)}, date = {2015-06-01/2015-06-03}, doi = {10.1145/2764967.2771821}, faupublication = {yes}, pages = {2}, peerreviewed = {unknown}, title = {{Adaptive} {Isolation} for {Predictable} {MPSoC} {Stream} {Processing}}, venue = {Schloss Rheinfels, St. Goar}, year = {2015} } @article{faucris.237078949, author = {Brand, Peter and Falk, Joachim and Ah Sue, Jonathan and Brendel, Johannes and Hasholzner, Ralph and Teich, Jürgen}, doi = {10.1109/TMC.2020.2988651}, faupublication = {yes}, journal = {IEEE Transactions on Mobile Computing}, pages = {2518-2535}, peerreviewed = {Yes}, title = {{Adaptive} {Predictive} {Power} {Management} for {Mobile} {LTE} {Devices}}, volume = {20}, year = {2021} } @inproceedings{faucris.118365984, abstract = {With the ongoing development of new FPGA generations, the reconfiguration time decreases and therefore the benefit of runtime reconfiguration increases. In this paper, we describe how to use runtime reconfiguration to improve the efficiency of transmitting streaming data on a communication channel shared with real-time applications. This means, the bandwidth that the streaming data has available is dynamically changing. To use the bandwidth effectively, different modules can be loaded on the reconfigurable hardware. These modules have a tradeoff between bandwidth and area requirements. The target now is to find an optimal reconfiguration schedule that minimizes an objective function consisting of two conflicting objectives: reducing the average area needed and providing a certain quality of transmission. In this paper, a model for this scheduling problem is presented and an Integer Linear Programming (ILP) formulation is introduced to calculate an optimal offline solution for benchmarking. In addition, an online scheduling system is presented. It uses the current delay of the streaming application to calculate the schedule. Extensive simulations have been made to show the benefits of the proposed solution.© 2010 IEEE.}, author = {Ziermann, Tobias and Teich, Jürgen}, booktitle = {Proc. 17th Reconfigurable Architectures Workshop}, date = {2010-04-19/2010-04-23}, doi = {10.1109/IPDPSW.2010.5470738}, faupublication = {yes}, note = {UnivIS-Import:2015-04-16:Pub.2010.tech.IMMD.inform.adapti}, pages = {1-4}, title = {{Adaptive} {Traffic} {Scheduling} {Techniques} for {Mixed} {Real}-{Time} and {Streaming} {Applications} on {Reconfigurable} {Hardware}}, venue = {Atlanta}, year = {2010} } @inproceedings{faucris.118367084, abstract = {In this paper we present an almost automatic synthesis of a highly complex, throughput optimized architecture of an adaptive multiresolution filter as used in medical image processing for FPGAs. The filter consists of 16 parallel working modules, where the most computationally intensive module achieves software pipelining of a factor of 85, that is, computations of 85 iterations overlap each other. By applying a state-of-the-art high-level synthesis tool, we show that this approach can be used for real world applications. In addition, we show that our high-level synthesis tool is capable of significantly reducing the well known productivity gap of embedded system design by almost two orders of magnitude. Finally, we can conclude that the FPGA implementation of the multiresolution image processing algorithm is far ahead of a comparable implementation for graphics cards in terms of power efficiency. © 2010 IEEE.}, author = {Hannig, Frank and Schmid, Moritz and Teich, Jürgen and Hornegger, Heinz}, booktitle = {Proc. IEEE International Conference on Field Programmable Technology}, date = {2010-12-08/2010-12-10}, doi = {10.1109/FPT.2010.5681464}, faupublication = {yes}, isbn = {978-1-4244-8982-4}, note = {UnivIS-Import:2015-04-16:Pub.2010.tech.IMMD.inform.adeepl}, pages = {485-490}, title = {{A} {Deeply} {Pipelined} and {Parallel} {Architecture} for {Denoising} {Medical} {Images}}, venue = {Beijing}, year = {2010} } @inproceedings{faucris.117709284, abstract = {Massively parallel processor array architectures can be used as hardware accelerators for a plenty of dataflow dominant applications. Bilateral filtering is an example of a state-of-the-art algorithm in medical imaging, which falls in the class of 2D adaptive filter algorithms. In this paper, we propose a semi-automatic mapping methodology for the generation of hardware accelerators for such a generic class of adaptive filtering applications in image processing. The final architecture deliver similar synthesis results as a hand-tuned design. © 2006 IEEE.}, author = {Dutta, Hritam and Hannig, Frank and Heigl, Benno and Hornegger, Heinz and Teich, Jürgen}, booktitle = {Proceedings of IEEE 17th International Conference on Application-specific Systems, Architectures, and Processors}, date = {2006-09-11/2006-09-13}, doi = {10.1109/ASAP.2006.4}, faupublication = {yes}, isbn = {978-0-7695-2682-9}, note = {UnivIS-Import:2015-04-16:Pub.2006.tech.IMMD.inform.adesig}, pages = {331-337}, publisher = {Institute of Electrical and Electronics Engineers}, title = {{A} {Design} {Methodology} for {Hardware} {Acceleration} of {Adaptive} {Filter} {Algorithms} in {Image} {Processing}}, venue = {Steamboat Springs, CO}, year = {2006} } @article{faucris.203550747, author = {Weichslgartner, Andreas and Wildermann, Stefan and Gangadharan, Deepak and Glaß, Michael and Teich, Jürgen}, doi = {10.1145/3274665}, faupublication = {yes}, journal = {ACM Transactions on Embedded Computing Systems}, peerreviewed = {Yes}, title = {{A} {Design}-{Time}/{Run}-{Time} {Application} {Mapping} {Methodology} for {Predictable} {Execution} {Time} in {MPSoCs}}, year = {2018} } @inproceedings{faucris.119911044, abstract = {The constantly growing amount of semiconductors in au-tomotive systems increases the number of possible defect mechanisms, and therefore raises also the effort to main-tain a sufficient level of quality and reliability. A promising solution to this problem is the on-line application of struc-tural tests in key components, typically ECUs. In this work, an approach for the optimized integration of both Software-Based Self-Tests (SBST) and Built-In Self-Tests (BIST) into E/E architectures is presented. The approach integrates the execution of the tests non-intrusively, i. e., it (a) does not af-fect functional applications and (b) does not require costly changes in the communication schedules or additional com-munication overhead. Via design space exploration, opti-mized implementations with respect to multiple con icting objectives, i. e., monetary costs, safety, test quality, and re-quired execution time are derived. Copyright 2014 ACM.}, address = {New York, NY, USA}, author = {Reimann, Felix and Glaß, Michael and Teich, Jürgen and Cook, Alejandro and Gomez, Laura Rodriguez and Ull, Dominik and Wunderlich, Hans-Joachim and Engelke, Piet and Abelein, Ulrich}, booktitle = {Proc. of The 51st Annual Design Automation Conference (DAC)}, date = {2014-06-02/2014-06-05}, doi = {10.1145/2593069.2602971}, faupublication = {yes}, isbn = {978-1-4799-3017-3}, note = {UnivIS-Import:2015-04-16:Pub.2014.tech.IMMD.inform.advanc}, pages = {8}, publisher = {ACM Press}, title = {{Advanced} {Diagnosis}: {SBST} and {BIST} {Integration} in {Automotive} {E}/{E} {Architectures}}, venue = {San Francisco, CA}, year = {2014} } @inproceedings{faucris.117434504, author = {Kissler, Dmitrij and Hannig, Frank and Kupriyanov, Olexiy and Teich, Jürgen}, booktitle = {Proceedings of the 2nd International Workshop on Reconfigurable Communication-Centric System-on-Chips (ReCoSoC)}, date = {2006-07-03/2006-07-05}, faupublication = {yes}, pages = {31-37}, peerreviewed = {unknown}, title = {{A} {Dynamically} {Reconfigurable} {Weakly} {Programmable} {Processor} {Array} {Architecture} {Template}}, year = {2006} } @inproceedings{faucris.116446484, address = {London}, author = {Ahmadinia, Ali and Bobda, Christophe and Koch, Dirk and Majer, Mateusz and Teich, Jürgen}, booktitle = {Proceedings of International Conference on Field-Programmable Logic and Applications (FPL)}, date = {2004-08-30/2004-09-01}, faupublication = {yes}, isbn = {3-540-22989-2}, note = {UnivIS-Import:2015-04-16:Pub.2004.tech.IMMD.inform.adynam{\_}15}, pages = {1032-1036}, publisher = {Springer}, series = {Lecture Notes in Computer Science (LNCS)}, title = {{A} {Dynamic} {NoC} {Approach} for {Communication} in {Reconfigurable} {Devices}}, venue = {Antwerp}, volume = {3203}, year = {2004} } @inproceedings{faucris.116446264, abstract = {Recent generations of FPGAs allow run-time partial reconfiguration. To increase the efficacy of reconfigurable computing, multitasking on FPGAs is proposed. One of the challenging problems in multitasking systems is online template placement. In this paper, we describe how existing algorithms work, and propose a new free space manager which is one main part of the placement algorithm. The decision where to place a new module depends on its finishing time mobility. Therefore the proposed algorithm is a combination of scheduling and placement. The simulation results show a better performance against existing methods. © Springer-Verlag 2004.}, address = {Berlin, Heidelberg}, author = {Ahmadinia, Ali and Bobda, Christophe and Teich, Jürgen}, booktitle = {Proceedings of 17th International Conference on Architecture of Computing Systems, Lecture Notes in Computer Science}, date = {2004-03-23/2004-03-26}, faupublication = {yes}, isbn = {3-540-21238-8}, note = {UnivIS-Import:2015-04-16:Pub.2004.tech.IMMD.inform.adynam}, pages = {125 - 139}, publisher = {Springer-verlag}, title = {{A} {Dynamic} {Scheduling} and {Placement} {Algorithm} for {Reconfigurable} {Hardware}}, venue = {Augsburg}, volume = {2981}, year = {2004} } @inproceedings{faucris.118192844, abstract = {This paper presents a feasibility-preserving crossover and mutation operator for evolutionary algorithms for constrained combinatorial problems. This novel operator is driven by an adapted Pseudo-Boolean solver that guarantees feasible offspring solutions. Hence, this allows the evolutionary algorithm to focus on the optimization of the objectives instead of searching for feasible solutions. Based on a proposed scalable testsuite, six specific testcases are introduced that allow a sound comparison of the feasibility-preserving operator to known methods. The experimental results show that the introduced approach is superior to common methods and competitive to a recent state-of-the-art decoding technique. © 2008 Springer-Verlag Berlin Heidelberg.}, address = {Berlin, Heidelberg}, author = {Glaß, Michael and Lukasiewycz, Martin and Teich, Jürgen}, booktitle = {Proceedings of the 10th International Conference on Parallel Problem Solving from Nature}, date = {2008-09-13/2008-09-17}, doi = {10.1007/978-3-540-87700-4}, faupublication = {yes}, isbn = {978-3-540-87699-1}, note = {UnivIS-Import:2015-04-16:Pub.2008.tech.IMMD.inform.afeasi{\_}8}, pages = {919-928}, publisher = {Springer-verlag}, series = {Lecture Notes in Computer Science (LNCS)}, title = {{A} {Feasibility}-{Preserving} {Crossover} and {Mutation} {Operator} for {Constrained} {Combinatorial} {Problems}}, url = {http://ls11-www.cs.uni-dortmund.de/ppsn/ppsn10/}, venue = {Dortmund}, volume = {5199}, year = {2008} } @inproceedings{faucris.122628044, abstract = {Meta-heuristic optimization approaches are commonly applied to many discrete optimization problems. Many of these optimization approaches are based on a local search operator like, e.g., the mutate or neighbor operator that are used in Evolution Strategies or Simulated Annealing, respectively. However, the straightforward implementations of these operators tend to deliver infeasible solutions in constrained optimization problems leading to a poor convergence. In this paper, a novel scheme for a local search operator for discrete constrained optimization problems is presented. By using a sophisticated methodology incorporating a backtracking-based ILP solver, the local search operator preserves the feasibility also on hard constrained problems. In detail, an implementation of the local serach operator as a feasibility-preserving mutate and neighbor operator is presented. To validate the usability of this approach, scalable discrete constrained testcases are introduced that allow to calculate the expected number of feasible solutions. Thus, the hardness of the testcases can be quantified. Hence, a sound comparison of different optimization methodologies is presented. © 2008 IEEE.}, author = {Lukasiewycz, Martin and Glaß, Michael and Haubelt, Christian and Teich, Jürgen}, booktitle = {Proceedings of the 2008 IEEE Congress on Evolutionary Computation (CEC 2008)}, doi = {10.1109/CEC.2008.4631058}, faupublication = {yes}, isbn = {9781424418237}, pages = {1968-1975}, peerreviewed = {unknown}, title = {{A} feasibility-preserving local search operator for constrained discrete optimization problems}, venue = {Hong Kong}, year = {2008} } @inproceedings{faucris.121374264, author = {Majer, Mateusz and Ahmadinia, Ali and Bobda, Christophe and Teich, Jürgen}, booktitle = {Proceedings of the Dynamically Reconfigurable Systems Workshop (DRS'2006)}, date = {2003-03-16/2003-03-16}, faupublication = {yes}, pages = {183-194}, peerreviewed = {unknown}, title = {{A} {Flexible} {Reconfiguration} {Manager} for the {Erlangen} {Slot} {Machine}}, venue = {Frankfurt/Main}, year = {2006} } @inproceedings{faucris.117390944, author = {Ziener, Daniel and Wildermann, Stefan and Oetken, Andreas and Weichslgartner, Andreas and Teich, Jürgen}, booktitle = {Proceedings of the Workshop on Computer Vision on Low-Power Reconfigurable Architectures at FPL 2011}, date = {2011-09-04/2011-09-04}, faupublication = {yes}, pages = {29-30}, peerreviewed = {unknown}, title = {{A} {Flexible} {Smart} {Camera} {System} based on a {Partially} {Reconfigurable} {Dynamic} {FPGA}-{SoC}}, venue = {Chania, Crete}, year = {2011} } @misc{faucris.123162644, author = {Dutta, Hritam and Hannig, Frank and Teich, Jürgen}, faupublication = {yes}, peerreviewed = {automatic}, title = {{A} {Formal} {Methodology} for {Hierarchical} {Partitioning} of {Piecewise} {Linear} {Algorithms}}, year = {2006} } @inproceedings{faucris.123144604, author = {Falk, Joachim and Keinert, Joachim and Haubelt, Christian and Teich, Jürgen and Bhattacharyya, Shuvra S.}, booktitle = {Proc. of the 8th ACM & IEEE international conference on Embedded software (EMSOFT'2008)}, date = {2008-10-20/2008-10-22}, faupublication = {yes}, pages = {189-198}, peerreviewed = {unknown}, title = {{A} {Generalized} {Static} {Data} {Flow} {Clustering} {Algorithm} for {MPSoC} {Scheduling} of {Multimedia} {Applications}}, venue = {Atlanta, Georgia}, year = {2008} } @inproceedings{faucris.110017644, author = {Kissler, Dmitrij and Kupriyanov, Olexiy and Hannig, Frank and Koch, Dirk and Teich, Jürgen}, booktitle = {Proceedings of the International Conference on Computer Design (CDES)}, faupublication = {yes}, pages = {189-195}, peerreviewed = {unknown}, title = {{A} {Generic} {Framework} for {Rapid} {Prototyping} of {System}-on-{Chip} {Designs}}, venue = {Las Vegas, NV}, year = {2006} } @inproceedings{faucris.107843164, author = {Salcic, Zoran and Nadeem, Muhammad and Park, Heejong and Teich, Jürgen}, booktitle = {Proceedings Emerging Technologies and Factory Automation (ETFA), 2016 IEEE 21st International Conference}, faupublication = {yes}, pages = {1-4}, peerreviewed = {unknown}, title = {{A} heterogeneous multi-core {SoC} for mixed criticality industrial automation systems}, year = {2016} } @inproceedings{faucris.124116784, author = {Özkan, Mehmet Akif and Reiche, Oliver and Hannig, Frank and Teich, Jürgen}, booktitle = {Proceedings of the Fourth International Workshop on FPGAs for Software Programmers (FSP)}, date = {2017-09-07}, faupublication = {yes}, isbn = {978-3-8007-4443-5}, peerreviewed = {Yes}, publisher = {VDE}, title = {{A} {Highly} {Efficient} and {Comprehensive} {Image} {Processing} {Library} for {C}++-based {High}-{Level} {Synthesis}}, url = {https://ieeexplore.ieee.org/document/8084549}, venue = {Ghent}, year = {2017} } @inproceedings{faucris.117072824, abstract = {In this paper a new class of highly parameterizable coarse-grained reconfigurable architectures called weakly programmable processor arrays is discussed. The main advantages of the proposed architecture template are the possibility of partial and differential reconfiguration and the systematical classification of different architectural parameters which allow to trade-off flexibility and hardware cost. The applicability of our approach is tested in a case study with different interconnect topologies on an FFGA platform. The results show substantial flexibility gains with only marginal additional hardware cost. © 2006 IEEE.}, author = {Kissler, Dmitrij and Kupriyanov, Olexiy and Hannig, Frank and Teich, Jürgen}, booktitle = {Proceedings of the IEEE International Conference on Field Programmable Technology (FPT 2006)}, date = {2006-12-13/2006-12-15}, doi = {10.1109/FPT.2006.270293}, faupublication = {yes}, isbn = {9780780397286}, pages = {105-112}, peerreviewed = {unknown}, title = {{A} highly parameterizable parallel processor array architecture}, venue = {Bangkok}, year = {2006} } @inproceedings{faucris.121673464, author = {Bednara, Marcus and Grabbe, Cornelia and J. Shokrollahi, J. and Teich, Jürgen and von zur Gathen, J.}, booktitle = {Proceedings of the International Parallel and Distributed Processing Symposium (IPDPS-2003)}, faupublication = {no}, note = {UnivIS-Import:2015-04-16:Pub.2003.tech.IMMD.inform.ahighp}, pages = {189}, series = {A High Performance VLIW Processor for Finite Field Arithmetic}, title = {{A} {High} {Performance} {VLIW} {Processor} for {Finite} {Field} {Arithmetic}}, venue = {Nice}, year = {2003} } @article{faucris.121046464, abstract = {New standards in signal, multimedia, and network processing for embedded electronics are characterized by computationally intensive algorithms, high flexibility due to the swift change in specifications. In order to meet demanding challenges of increasing computational requirements and stringent constraints on area and power consumption in fields of embedded engineering, there is a gradual trend towards coarse-grained parallel embedded processors. Furthermore, such processors are enabled with dynamic reconfiguration features for supporting time- and space-multiplexed execution of the algorithms. However, the formidable problem in efficient mapping of applications (mostly loop algorithms) onto such architectures has been a hindrance in their mass acceptance. In this paper we present (a) a highly parameterizable, tightly coupled, and reconfigurable parallel processor architecture together with the corresponding power breakdown and reconfiguration time analysis of a case study application, (b) a retargetable methodology for mapping of loop algorithms, (c) a co-design framework for modeling, simulation, and programming of such architectures, and (d) loosely coupled communication with host processor. © 2008 Elsevier B.V. All rights reserved.}, author = {Dutta, Hritam and Kissler, Dmitrij and Hannig, Frank and Kupriyanov, Olexiy and Teich, Jürgen and Pottier, Bernard}, doi = {10.1016/j.micpro.2008.08.007}, faupublication = {yes}, journal = {Microprocessors and Microsystems}, keywords = {Architecture/compiler co-design; Coarse-grained embedded parallel processors; Communication; Mapping of loop programs; Reconfiguration}, note = {UnivIS-Import:2015-04-14:Pub.2009.tech.IMMD.inform.aholis}, pages = {53-62}, peerreviewed = {Yes}, title = {{A} {Holistic} {Approach} for {Tightly} {Coupled} {Reconfigurable} {Parallel} {Processors}}, volume = {33}, year = {2009} } @article{faucris.112123704, author = {Schöber, Volker and Bringmann, Oliver and Herkersdorf, Andreas and Stechele, Walter and Wehn, Norbert and May, Matthias and Ziener, Daniel and Bouajila, Abdelmajid and Baldin, Daniel and Zeppenfeld, Johannes and Sanders, Björn and Teich, Jürgen and Sebastian, Maurice and Ernst, Rolf and Treytnar, Dieter}, faupublication = {yes}, journal = {newsletter edacentrum}, note = {UnivIS-Import:2015-03-09:Pub.2009.tech.IMMD.inform.aisaut}, pages = {5-13}, peerreviewed = {No}, title = {{AIS}-{Autonomous} {Integrated} {Systems}}, volume = {4}, year = {2009} } @inproceedings{faucris.117990004, abstract = {In this paper, we introduce a methodology for rapid prototyping of application-specific instruction set processors (ASIPs) including the automatic generation of bit-true and cycle-accurate instruction-set simulators and corresponding compiler (re)targets. The methodology is based on ASMs (abstract state machines) as the underlying formal model for describing a processor's behavior. We explain the major advantages of using ASMs and outline the main tool flow from graphical entry of a processor's major RTL building blocks and simulator generation as well as the current status of our project.}, author = {Fischer, Dirk and Teich, Jürgen and Trinkert, Stefan and Weper, Ralph}, booktitle = {ACM SIG Proc. International Conference on Compilers, Architectures and Synthesis for Embedded Systems}, date = {2000-11-17/2000-11-18}, faupublication = {no}, note = {UnivIS-Import:2015-04-16:Pub.2000.tech.IMMD.inform.ajoine}, pages = {26 - 33}, title = {{A} {Joined} {Architecture}/{Compiler} {Environment} for {ASIPs}}, venue = {San Jose, CA}, year = {2000} } @inproceedings{faucris.201567465, abstract = {Field Programmable Gate Arrays (FPGAs) are continually improving their computing capabilities and energy efficiency.
Yet, programming FPGAs remains a time-consuming task and requires expert knowledge to obtain good performance.
Whereas recent advancements in High-Level Synthesis (HLS) promise to solve this problem, today’s HLS tools need
low-level optimizations such that an application-tailored FPGA implementation for a target algorithm needs to
be described using vendor-specific compiler hints and requires code restructuring. Despite the pursuit of new programming methodologies for many-core, multi-threading, or vector architectures, the FPGA community mostly tries
to improve the design techniques from existing programming languages that are either sequential or developed for
other computing platforms. In this paper, we investigate a state-of-the-art functional language, namely Impala,
that offers explicit control over code refinement. To demonstrate our approach, we examine the description of image border handling for stencil functions and present elegant code descriptions. Besides, we show that the low-level
descriptions can easily be refined to high-level abstractions and serving software developers in the form of either a DSL or a library.
}, author = {Özkan, Mehmet Akif and Pérard-Gayot, Arsène and Membarth, Richard and Slusallek, Philipp and Teich, Jürgen and Hannig, Frank}, booktitle = {Proceedings of the Fifth International Workshop on FPGAs for Software Programmers}, date = {2018-08-31}, faupublication = {yes}, isbn = {978-3-8007-4723-8}, keywords = {DSL; HLS; AnyDSL; FPGA; Border Handling; Intel OpenCL}, peerreviewed = {unknown}, publisher = {VDE}, title = {{A} {Journey} into {DSL} {Design} using {Generative} {Programming}: {FPGA} {Mapping} of {Image} {Border} {Handling} through {Refinement}}, url = {https://www12.cs.fau.de/downloads/oezkan/publications/fsp18.pdf}, venue = {Dublin}, year = {2018} } @article{faucris.271549571, author = {Spieck, Jan and Wildermann, Stefan and Teich, Jürgen}, doi = {10.1145/3529230}, faupublication = {yes}, journal = {ACM Transactions on Design Automation of Electronic Systems}, pages = {4:1 - 4:40}, peerreviewed = {Yes}, title = {{A} {Learning}-{Based} {Methodology} for {Scenario}-{Aware} {Mapping} of {Soft} {Real}-{Time} {Applications} onto {Heterogeneous} {MPSoCs}}, volume = {28}, year = {2023} } @inproceedings{faucris.316726668, author = {Walter, Dominik and Brand, Marcel and Heidorn, Christian and Witterauf, Michael and Hannig, Frank and Teich, Jürgen}, booktitle = {Proceedings of the IEEE International Symposium on Circuits and Systems (ISCAS)}, date = {2024-05-19/2024-05-22}, faupublication = {yes}, peerreviewed = {Yes}, title = {{ALPACA}: {An} {Accelerator} {Chip} for {Nested} {Loop} {Programs}}, venue = {Singapore}, year = {2024} } @inproceedings{faucris.117233644, author = {Becher, Andreas and Echavarria Gutiérrez, Jorge Alfonso and Ziener, Daniel and Wildermann, Stefan and Teich, Jürgen}, booktitle = {Proceedings of the 24th Annual IEEE International Symposium on Field-Programmable Custom Computing Machines}, date = {2016-05-01/2016-05-03}, doi = {10.1109/FCCM.2016.16}, faupublication = {yes}, isbn = {978-1-5090-2356-1}, peerreviewed = {Yes}, publisher = {IEEE}, title = {{A} {LUT}-{Based} {Approximate} {Adder}}, venue = {Washington DC}, year = {2016} } @inproceedings{faucris.109668284, author = {Graf, Sebastian and Glaß, Michael and Teich, Jürgen and Platte, Daniel}, booktitle = {Proceedings of the Stuttgart International Symposium}, date = {2015-03-17/2015-03-18}, doi = {10.1007/978-3-658-08844-6{\_}14}, faupublication = {yes}, pages = {203-215}, peerreviewed = {unknown}, title = {{A} {Methodology} for the {Optimized} {Design} of an {E}/{E} {Architecture} {Component} {Platform}}, venue = {Stuttgart}, year = {2015} } @inproceedings{faucris.121665764, abstract = {High-level synthesis tools are gaining more and more acceptance in industrial design flows. While they increase productivity in implementing a single complex hardware module, synthesizing and optimizing many hardware components simultaneously is still an open problem. In particular, resource sharing is typically only performed for single components, thereby neglecting optimization possibilities across concurrent modules. On the other hand, domain-specific models and specifications, which are generally seen as a key ingredient to raise the level of abstraction in future design flows, may enable such global optimizations. In this paper, we present a model-based approach for inter-process resource sharing which provides for efficient high-level synthesis of streaming applications modeled as a set of communicating processes. The applicability of the proposed approach is validated by a case study. © 2012 ECSI.}, author = {Zebelein, Christian and Falk, Joachim and Haubelt, Christian and Teich, Jürgen}, booktitle = {2nd Electronic System Level Synthesis Conference, ESLsyn 2012}, date = {2012-06-02/2012-06-03}, faupublication = {yes}, isbn = {9782953998719}, pages = {17-22}, peerreviewed = {unknown}, title = {{A} model-based inter-process resource sharing approach for high-level synthesis of dataflow graphs}, url = {https://www.scopus.com/inward/record.url?partnerID=HzOxMe3b&scp=84866155902&origin=inward}, venue = {San Francisco, CA}, year = {2012} } @inproceedings{faucris.118033784, address = {Berlin}, author = {Teich, Jürgen and Slomka, Frank}, booktitle = {VDE/VDI-Gesellschaft Mikroelektronik, Mikro- und Feinwerktechnik (GMM), GMM-Fachbericht}, faupublication = {no}, note = {UnivIS-Import:2015-04-16:Pub.2003.tech.IMMD.inform.amodel}, pages = {91-96}, publisher = {VDE-Verlag}, series = {A Model for Buffer Exploration in EDF Scheduled Embedded Systems}, title = {{A} {Model} for {Buffer} {Exploration} in {EDF} {Scheduled} {Embedded} {Systems}}, venue = {Erlangen}, year = {2003} } @inproceedings{faucris.116798704, abstract = {The new High-Efficiency Video Coding (HEVC) standard achieves much better compression ratios than previous ones by offering multiple coding modes, albeit with a significant increase over the required computational power especially at the encoder side. As the first major contribution, we propose a fine-grained parallelization of the encoding mode decision process using a SystemC actor-based model, exploiting multi-core platforms. Second, based on this model, we analyze achievable speedups compared to the single core sequential implementation of the HM-16.0 reference software. Using four different video sequences, we find that our approach achieves an equivalent rate-distortion performance for different quantization parameter values with a simulated encoding time improvement factor of up to \(9\times \) for a maximally parallelized mode decision process. Third, an HEVC encoder has a huge number of different standard-complying encoding modes to choose from for each encoded frame, making the exploration space almost impossible to be fully covered by a brute-force search. Here, we systematically investigate the trade-off in encoding time versus required number of processor cores by proposing a multi-objective Design Space Exploration (DSE) of the mapping of the parallelized mode decision tasks to processing resources, taking as optimization objectives the resulting bitrate, image quality, number of processor cores used, execution time, and total energy consumptio}, address = {Berlin; Heidelberg}, author = {Rosales, Rafael and Herglotz, Christian and Glaß, Michael and Teich, Jürgen and Kaup, André}, booktitle = {In Proceedings of the International Conference on Architecture of Computing Systems (ARCS)}, date = {2016-04-04/2016-04-07}, doi = {10.1007/978-3-319-30695-7{\_}20}, editor = {Springer}, faupublication = {yes}, isbn = {9783319306940}, keywords = {Video encoding · HEVC · ESL · DSE · Parallelization · Actor-based modeling · Hardware/software co-design}, note = {UnivIS-Import:2016-06-01:Pub.2016.tech.IE.LEN.analys}, pages = {263-276}, peerreviewed = {Yes}, publisher = {Springer Verlag}, title = {{Analysis} and {Exploitation} of {CTU}-{Level} {Parallelism} in the {HEVC} {Mode} {Decision} {Process} {Using} {Actor}-based {Modeling}}, venue = {Nürnberg}, year = {2016} } @article{faucris.111379664, abstract = {In this paper, we consider the problem of analyzing dataflow programs with the property that actor production and consumption rates are not constant and fixed, but limited by intervals. Such interval ranges may result from uncertainty in the specification of an actor or as a design freedom of the model. Major questions such as consistencyand buffer memory requirementsfor single-processor scheduleswill be analyzed here for such specifications for the first time. Also, metamodeling formulations of interval limited dataflow are discussed, with special emphasis on the application to cyclo-static dataflow modeling. © 2006 Springer Science + Business Media, LLC.}, author = {Bhattacharyya, Shuvra S. and Teich, Jürgen}, doi = {10.1007/s11265-006-7274-2}, faupublication = {yes}, journal = {Journal of VLSI Signal Processing Systems for Signal, Image, and Video Technology}, keywords = {Buffer memory analysis; Interval rates; Scheduling dataflow models of computation}, note = {UnivIS-Import:2015-03-09:Pub.2006.tech.IMMD.inform.analys}, pages = {247-258}, peerreviewed = {unknown}, title = {{Analysis} of {Dataflow} {Programs} with {Interval}-limited {Data}-rates}, volume = {Vol. 43, Nos. 2-3}, year = {2006} } @inproceedings{faucris.118050944, abstract = {In this paper, we consider the problem of analyzing data flow programs with the property that actor production and consumption rates are not constant and fixed, but limited by intervals. Such interval ranges may result from uncertainty in the specification of an actor or as a design freedom of the model. Major questions such as consistency and buffer memory requirements for single-processor schedules will be analyzed here for such specifications for the first time. © Springer-Verlag Berlin Heidelberg 2004.}, address = {Berlin}, author = {Teich, Jürgen and Bhattacharyya, Shuvra S.}, booktitle = {Computer Systems: Architectures, Modeling, and Simulation}, date = {2004-07-19/2004-07-21}, faupublication = {yes}, note = {UnivIS-Import:2015-04-16:Pub.2004.tech.IMMD.inform.analys}, pages = {507-518}, publisher = {Springer-verlag}, series = {Lecture Notes in Computer Science (LNCS)}, title = {{Analysis} of {Dataflow} {Programs} with {Interval}-{Limited} {Data}-{Rates}}, venue = {Samos}, volume = {3133}, year = {2004} } @article{faucris.117073264, abstract = {Applications in the signal processing domain are often modeled by dataflow graphs. Due to heterogeneous complexity requirements, these graphs contain both dynamic and static dataflow actors. In previous work, we presented a generalized clustering approach for these heterogeneous dataflow graphs in the presence of unbounded buffers. This clustering approach allows the application of static scheduling methodologies for static parts of an application during embedded software generation for multiprocessor systems. It systematically exploits the predictability and efficiency of the static dataflow model to obtain latency and throughput improvements. In this article, we present a generalization of this clustering technique to dataflow graphs with bounded buffers, therefore enabling synthesis for embedded systems without dynamic memory allocation. Furthermore, a case study is given to demonstrate the performance benefits of the approach. © 2010 ACM.}, author = {Falk, Joachim and Zebelein, Christian and Keinert, Joachim and Haubelt, Christian and Teich, Jürgen and Bhattacharyya, Shuvra S.}, doi = {10.1145/1880050.1880054}, faupublication = {yes}, journal = {ACM Transactions on Embedded Computing Systems}, keywords = {Actor-oriented design; Clustering; Data flow analysis; Scheduling}, peerreviewed = {Yes}, title = {{Analysis} of systemc actor networks for efficient synthesis}, volume = {10}, year = {2010} } @inproceedings{faucris.120568844, address = {Berlin}, author = {Graf, Sebastian and Streubühr, Martin and Glaß, Michael and Teich, Jürgen}, booktitle = {Proceedings of the Automotive meets Electronics (AmE2011), GMM Fachbericht 69}, faupublication = {yes}, isbn = {978-3-8007-3345-3}, note = {UnivIS-Import:2015-04-16:Pub.2011.tech.IMMD.inform.analyz}, pages = {10-15}, publisher = {VDE VERLAG}, title = {{Analyzing} {Automotive} {Networks} using {Virtual} {Prototypes}}, venue = {Dortmund, Germany}, year = {2011} } @inproceedings{faucris.239199786, author = {Echavarria Gutiérrez, Jorge Alfonso and Wildermann, Stefan and Khosravi, Faramarz and Teich, Jürgen}, booktitle = {AxC20: 5th Workshop on Approximate Computing}, date = {2020-07-19/2020-07-24}, faupublication = {yes}, keywords = {Approximate Sequential Multiplier}, peerreviewed = {Yes}, title = {{An} {Approximate} {Sequential} {Multiplier} with {Segmented} {Carry} {Chain} and {Variable} {Accuracy}}, venue = {San Francisco, CA}, year = {2020} } @inproceedings{faucris.110016324, author = {Kupriyanov, Olexiy and Hannig, Frank and Kissler, Dmitrij and Teich, Jürgen and Schaffer, Rainer and Merker, Renate}, booktitle = {Proceedings of the 9th ITG/GMM/GI Workshop, Methoden und Beschreibungssprachen zur Modellierung und Verifikation von Schaltungen und Systemen}, date = {2006-02-20/2006-02-22}, faupublication = {yes}, pages = {11-20}, peerreviewed = {unknown}, title = {{An} {Architecture} {Description} {Language} for {Massively} {Parallel} {Processor} {Architectures}}, venue = {Dresden}, year = {2006} } @inproceedings{faucris.118485444, abstract = {In premium vehicles, the number of distributed comfort-, safety-, and infotainment-related functions is steadily increasing. For this reason, the requirements for the underlying communication architecture are also becoming stronger. In addition, the diversity of todays deployed communication technologies and the need for higher bandwidths complicate the design of future network architectures. Ethernet and IP, both standardized and widely used, could be one solution to homogenize communication architectures and to provide higher bandwidths. This paper focuses on a migration concept for replacing todays employed CAN-buses by Ethernet/IP-based networks. It highlights several concepts to minimize the protocol header overhead by using EA- and rule-based algorithms and presents migration results for currently deployed automotive CAN subnetworks. © 2011 EDAA.}, address = {New York, NY, USA}, author = {Kern, Andreas and Streichert, Thilo and Teich, Jürgen}, booktitle = {Proc. of DATE}, date = {2011-03-14/2011-03-18}, faupublication = {yes}, isbn = {978-1-61284-208-0}, keywords = {automotive; CAN; CANoverIP; embedded; Ethernet; IP; migration; optimization; UDP; XoverIP}, note = {UnivIS-Import:2015-04-16:Pub.2011.tech.IMMD.inform.anauto}, pages = {112-117}, publisher = {IEEE Press}, title = {{An} {Automated} {Data} {Structure} {Migration} {Concept} - {From} {CAN} to {Ethernet}/{IP} in {Automotive} {Embedded} {Systems} ({CANoverIP})}, url = {http://ieeexplore.ieee.org/xpls/abs{\_}all.jsp?arnumber=5763027}, venue = {Grenoble}, year = {2011} } @inproceedings{faucris.110177144, author = {Schmidt, Bernhard and Ziener, Daniel and Teich, Jürgen}, booktitle = {In Proceedings of 2014 ACM/SIGDA International Symposium on Field-Programmable Gate Arrays, FPGA '14}, date = {2014-02-26/2014-02-28}, faupublication = {yes}, isbn = {978-1-4503-2671-1}, pages = {257-257}, peerreviewed = {unknown}, title = {{An} automatic netlist and floorplanning approach to improve the {MTTR} of scrubbing techniques}, venue = {Monterey, CA}, year = {2014} } @inproceedings{faucris.250542509, abstract = {

Border handling is a crucial step in many image processing applications. For stencil kernels such as the Gaussian filter where a window of pixels is required to compute an output pixel, the border of the image needs to be handled differently than the body of the image. To prevent out-of-bounds accesses, conditional statements need to be inserted into the pixel address calculation. This introduces significant overhead, especially on hardware accelerators such as GPUs. Existing research efforts mostly focus on image body computations, while neglecting the importance of border handling or treating it as a corner case. In this paper, we propose an efficient border handling approach for GPUs. Our approach is based on iteration space partitioning, which is a technique similar to index-set splitting, a well-known general-purpose compiler optimization. We present a detailed systematic analysis including an analytic model that quantitatively evaluates the benefits as well as the costs of the transformation. In addition, manually implementing the border handling technique is a tedious task and not portable at all. We integrate our approach into an image processing DSL and a source-to-source compiler called Hipacc to relieve the burden and increase programmers’ productivity. We evaluate over five commonly used image process- ing applications on two Nvidia GPUs. Results show our proposed approach achieves a geometric mean speedup of up to 87% over a naive implementation.

}, author = {Qiao, Bo and Teich, Jürgen and Hannig, Frank}, booktitle = {Proceedings of the 2021 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW)}, date = {2021-05-17/2021-05-21}, doi = {10.1109/IPDPSW52791.2021.00067}, faupublication = {yes}, isbn = {978-1-6654-3577-2}, keywords = {Border Handling, Image Processing, GPU, DSL}, pages = {387-396}, peerreviewed = {Yes}, title = {{An} {Efficient} {Approach} for {Image} {Border} {Handling} on {GPUs} via {Iteration} {Space} {Partitioning}}, venue = {Portland, OR}, year = {2021} } @inproceedings{faucris.117074144, abstract = {Importance measure analysis judges the relative importance of components in a system and reveals how each component contributes to the system reliability. In the design of large and complex systems, importance measure analysis can therefore be employed to guide an optimization tool which design decisions to investigate to gain higher reliability. While previous research has mainly concentrated on developing analytical importance measure techniques, the automatic and frequent computing of importance measures as required in the context of design space exploration has got very few, if any attention. This paper presents a highly efficient technique to compute the reliability and structural importance measures of components of a system. The proposed technique considers the reliability of a system implementation and subsequently analyzes the importance measures of its components based on a state-of-the-art Monte Carlo simulation. The technique can therefore estimate the importance measures of all components concurrently, highly improving the performance of the computation compared, e. g., to the well-known Birnbaum approach by the factor of 2n with n being the number of components. Moreover, we show how this algorithm can be extended to support importance measure analysis in the existence of transient faults which is essential since in future systems, transient faults are expected to cause relatively more failures than permanent faults. We integrated the proposed analysis approach in an existing multi-objective local-search algorithm that is part of an automatic system-level design space exploration which seeks for system implementations with highest reliability at lowest possible cost. Experimental results show that the proposed algorithm performs efficiently with negligible imprecision, even for large realworld examples. Copyright 2014 ACM.}, author = {Aliee, Hananeh and Glaß, Michael and Khosravi, Faramarz and Teich, Jürgen}, booktitle = {Proceedings of the International Conference on Hardware/Software Codesign and System Synthesis (CODES+ISSS 2014)}, date = {2014-10-12/2014-10-17}, doi = {10.1145/2656075.2656079}, faupublication = {yes}, isbn = {9781450330510}, pages = {34:1-34:10}, peerreviewed = {unknown}, publisher = {Association for Computing Machinery, Inc}, title = {{An} efficient technique for computing importance measures in automatic design of dependable embedded systems}, venue = {New Delhi}, year = {2014} } @inproceedings{faucris.118771444, abstract = {Software systems are becoming increasingly complex, requiring a deep knowledge to work and program with them. This is especially true for simulation frameworks used by scientists and engineers, but also applies to completely different domains such as mobile or web applications. To ease working with these systems, domain-specific languages (DSLs) are a convenient way to enable domain experts describe settings and problems they want to solve using terms familiar to them. Building upon this specification in the DSL, a compiler transform this to the target software framework, e.,g., runnable program code. To write such a compiler, a solid implementation framework is needed. In this paper, we propose criteria for the evaluation of textual programming language implementation frameworks to which we accordingly evaluate four technologies, namely Spoofax/IMP, Rascal MPL, a custom approach using C++ and a custom approach using Scal}, address = {New York, NY, USA}, author = {Schmitt, Christian and Kuckuk, Sebastian and Köstler, Harald and Hannig, Frank and Teich, Jürgen}, booktitle = {Proc. of the 14th International Conference on Computational Science and its Applications (ICCSA)}, date = {2014-06-30/2014-07-03}, doi = {10.1109/ICCSA.2014.16}, faupublication = {yes}, isbn = {978-1-4799-4264-0}, note = {UnivIS-Import:2015-04-16:Pub.2014.tech.IMMD.inform.aneval}, pages = {18-26}, peerreviewed = {Yes}, publisher = {IEEE Press}, title = {{An} {Evaluation} of {Domain}-{Specific} {Language} {Technologies} for {Code} {Generation}}, venue = {Minho, Guimaraes}, year = {2014} } @inproceedings{faucris.117061604, author = {Blickle, Tobias and Teich, Jürgen and Thiele, Lothar}, booktitle = {Proc. of Codes/CASHE'97, the 5th Int. Workshop on Hardware/Software Co-design, Braunschweig, Germany, pp. 167-171, March 1997}, faupublication = {no}, pages = {167-171}, peerreviewed = {unknown}, title = {{An} {Evolutionary} {Approach} to {System}-{Level} {Synthesis}}, venue = {Braunschweig}, year = {1997} } @inproceedings{faucris.121433884, author = {Teich, Jürgen and Blickle, Tobias and Thiele, Lothar}, booktitle = {Proc. of WSC1, the 1st Online Workshop on Soft Computing}, faupublication = {no}, note = {UnivIS-Import:2015-04-16:Pub.1996.tech.IMMD.inform.anevol}, pages = {251-256}, title = {{An} evolutionary approach to system-level synthesis}, venue = {Nagoya, Japan}, year = {1996} } @inproceedings{faucris.116451544, abstract = {By increasing the amount of resources on reconfigurable platforms with the abililty of partial reconfigurability, the issues of the management of these resources and their sharing among different tasks will become more of a concern. Online placement is one of these management issues that is investigated in this paper. Here we present a new approach for online placement of modules on reconfigurable devices, by managing the occupied space rather the free space on the device. Also an optimization of communication between running modules themselves and outside of the chip is proposed. The experimental results show a considerable decrease in communication and routing costs.}, author = {Ahmadinia, Ali and Bobda, Christophe and Bednara, Marcus and Teich, Jürgen}, booktitle = {Proc. of the International Parallel and Distributed Processing Symposium (IPDPS-2004), Reconfigurable Architectures Workshop}, date = {2004-04-26/2004-04-30}, faupublication = {yes}, isbn = {0-7695-2132-0}, note = {UnivIS-Import:2015-04-16:Pub.2004.tech.IMMD.inform.anewap}, title = {{A} {New} {Approach} for {On}-line {Placement} on {Reconfigurable} {Devices}}, venue = {Santa Fé NM}, year = {2004} } @inproceedings{faucris.118034004, abstract = {We present a new approach for reconfigurable massively parallel computers. The approach uses FPGA as reconfigurable device to build parallel computers which can adapt their physical topology to match the virtual topology used to model the parallel computation paradigm of a given application. We use a case study in which a virtual ring topology is first simulated on a tree topology and then directly implemented in an FPGA configuration. Preliminary results show that we can increase the performance of the parallel computers which make use of message passing interface by afactor of up to 20 % if a reconfigurable topology approach is used.}, author = {Ahmadinia, Ali and Bobda, Christophe and Danne, Klaus and Teich, Jürgen}, booktitle = {Proceedings of the IEEE International Conference on Field-Programmable Technology}, date = {2003-12-15/2003-12-17}, doi = {10.1109/FPT.2003.1275784}, faupublication = {yes}, isbn = {9780780383203}, note = {UnivIS-Import:2015-04-16:Pub.2003.tech.IMMD.inform.anewap{\_}5}, pages = {391-394}, peerreviewed = {unknown}, publisher = {Institute of Electrical and Electronics Engineers Inc.}, series = {A New Approach for Reconfigurable Massively Parallel Computers}, title = {{A} {New} {Approach} for {Reconfigurable} {Massively} {Parallel} {Computers}}, venue = {Tokyo}, year = {2003} } @misc{faucris.109487004, author = {Teich, Jürgen and Thiele, Lothar}, faupublication = {no}, peerreviewed = {automatic}, title = {{A} new approach to solving resource-constrained scheduling problems based on a flow-model}, year = {1996} } @article{faucris.117214504, author = {Borgonovo, Emanuele and Aliee, Hananeh and Glaß, Michael and Teich, Jürgen}, doi = {10.1016/j.ejor.2016.03.054}, faupublication = {yes}, journal = {European Journal of Operational Research}, peerreviewed = {Yes}, title = {{A} {New} {Time}-{Independent} {Reliability} {Importance} {Measure}}, year = {2016} } @inproceedings{faucris.287395588, abstract = {Big Data applications frequently involve processing data streams encoded in semi-structured data formats such as JSON, Protobuf, or Avro. A major challenge in accelerating data stream processing on FPGAs is that the parsing of such data formats is usually highly complex. This is especially true for JSON parsing on FPGAs, which lies in the focus of related work. The parsing of the binary Avro format, on the other hand, is perfectly suited for being processed on FPGAs and can thus serve as an enabler for data stream processing on FPGAs. In this realm, we present a methodology for parsing, projection, and selection of Avro objects, which enforces an output format suitable for further processing on the FPGA. Moreover, we provide a generator to automatically create accelerators based on this methodology. The obtained accelerators can achieve significant speedups compared to CPU-based parsers, and at the same time require only very few FPGA resources.

In this paper, we present AnyHLS, an approach to synthesize FPGA designs in a modular and abstract way. AnyHLS is able to raise the abstraction level of existing HLS tools by resorting to modern programming language features such as types and higher-order functions as follows: First, partial evaluation is used to specialize and to optimize the user application based on a library of abstractions. Finally, the backend of AnyHLS generates vendor-specific HLS code for Intel and Xilinx FPGAs. Portability is obtained by avoiding any vendor-specific pragmas at the source code. In order to validate achievable gains in productivity, a library for the domain of image processing is introduced as a case study, and its synthesis results are compared with several state-of-the-art DSL approaches for this domai}, author = {Özkan, Mehmet Akif and Pérard-Gayot, Arsène and Membarth, Richard and Slusallek, Philipp and Leißa, Roland and Hack, Sebastian and Teich, Jürgen and Hannig, Frank}, booktitle = {International Conference on Hardware/Software Codesign and System Synthesis (CODES+ISSS)}, date = {2020-09-20/2020-09-25}, doi = {10.1109/tcad.2020.3012172}, faupublication = {yes}, peerreviewed = {Yes}, title = {{AnyHLS}: {High}-{Level} {Synthesis} with {Partial} {Evaluation}}, url = {https://arxiv.org/pdf/2002.05796.pdf}, venue = {Hamburg}, year = {2020} } @article{faucris.240175486, abstract = {

FPGAs excel in low power and high throughput computations, but they are challenging to program. Traditionally, developers rely on hardware description languages like Verilog or VHDL to specify the hardware behavior at the register-transfer level. High-Level Synthesis (HLS) raises the level of abstraction but still requires FPGA design knowledge. Programmers usually write pragma-annotated C/C++ programs to define the hardware architecture of an application. However, each hardware vendor extends its own C dialect using its own vendor-specific set of pragmas. This prevents portability across different vendors. Furthermore, pragmas are not first-class citizens in the language. This makes it hard to use them in a modular way or design proper abstractions.

}, author = {Özkan, Mehmet Akif and Pérard-Gayot, Arsène and Membarth, Richard and Slusallek, Philipp and Leißa, Roland and Hack, Sebastian and Teich, Jürgen and Hannig, Frank}, doi = {10.1109/TCAD.2020.3012172}, faupublication = {yes}, journal = {IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems}, pages = {3202-3214}, peerreviewed = {Yes}, title = {{AnyHLS}: {High}-{Level} {Synthesis} with {Partial} {Evaluation}}, url = {https://arxiv.org/pdf/2002.05796.pdf}, volume = {39}, year = {2020} } @inproceedings{faucris.239132293, abstract = {In this paper, we present anytime instructions for floating-

point additions and multiplications. Specific to such instructions is

their ability to compute an arithmetic operation at a programmable

accuracy of a most significant bits where a is encoded in the instruction

itself. Contrary to reduced-precision architectures, the word length is

maintained throughout the execution. Two approaches are presented for

the efficient implementation of anytime additions and multiplications, one

based on on-line arithmetic and the other on bitmasking. We propose

implementations of anytime functional units for both approaches and

evaluate them in terms of error, latency, area, as well as energy savings.

As a result, 15% of energy can be saved on average while computing a

floating-point addition with an error of less than 0.1%. Moreover, large

latency and energy savings are reported for iterative algorithms such as

a Jacobi algorithm with savings of up to 39% in energy.

}, author = {Brand, Marcel and Witterauf, Michael and Bosio, Alberto and Teich, Jürgen}, booktitle = {Proceedings of the 31st IEEE International Conference on Application-specific Systems, Architectures and Processors}, date = {2020-07-06/2020-07-08}, doi = {10.1109/ASAP49362.2020.00034}, faupublication = {yes}, peerreviewed = {unknown}, title = {{Anytime} {Floating}-{Point} {Addition} and {Multiplication} – {Concepts} and {Implementations}}, venue = {Manchester, U.K.}, year = {2020} } @inproceedings{faucris.213198267, author = {Brand, Marcel and Witterauf, Michael and Hannig, Frank and Teich, Jürgen}, booktitle = {ACM International Conference on Computing Frontiers 2019}, date = {2019-04-30/2019-05-02}, doi = {10.1145/3310273.3322833}, editor = {ACM}, faupublication = {yes}, isbn = {978-1-4503-6685-4}, pages = {215 - 219}, peerreviewed = {Yes}, title = {{Anytime} {Instructions} for {Programmable} {Accuracy} {Floating}-{Point} {Arithmetic}}, venue = {Alghero, Sardinia}, year = {2019} } @article{faucris.117311964, author = {Aliee, Hananeh and Glaß, Michael and Chen, Liang and Ebrahimi, Mojtaba and Khosravi, Faramarz and Kleeberger, Veit B. and Listl, Alexandra and Müller-Gritschneder, Daniel and Oboril, Fabian and Schlichtmann, Ulf and Tahoori, Mehdi B. and Teich, Jürgen and Wehn, Norbert and Weis, Christian}, doi = {10.1515/itit-2014-1080}, faupublication = {yes}, journal = {it - Information Technology}, pages = {159-169}, peerreviewed = {Yes}, title = {{Application}-aware cross-layer reliability analysis and optimization}, volume = {57}, year = {2015} } @inproceedings{faucris.109501304, abstract = {The growing demand of computationally intensive algorithms/applications has resulted in the widespread acceptance of heterogeneous MPSoC platforms. The primary reason for this trend is due to the better performance and power efficiency exhibited by heterogeneous architectures consisting of standard processor cores and hardware accelerators. However, multiple processors accessing shared resources such as cache/memory and buses may lead to significant contention on them, thereby decreasing not only the performance, but also timing predictability. Moreover, the effect of shared resource contention worsens in the presence of multiple application scenarios with different execution and communication bandwidth requirements. To mitigate this problem, we first propose a Dynamic Bus Reconfiguration Policy (DBRP) that decides when to reconfigure a shared bus between Non-Preemptive Fixed Priority (NP-FP) and Time-Division Multiple Access (TDMA) scheduling. The required TDMA slot sizes are computed on-the-fly before NP-FP to TDMA reconfiguration such that deadlines of all Hard Real-Time (HRT) applications are satisfied and all Soft Real-Time (SRT) applications are serviced evenly. Our proposed DBRP has been implemented on a real MPSoC platform consisting of cores connected by the AMBA AHB. The case studies demonstrate that reconfiguration of bus arbitration ensures that communication deadline constraints of HRT applications are maximally satisfied with low hardware and reconfiguration overhead.}, author = {Gangadharan, Deepak and Sousa, Éricles and Lari, Vahid and Hannig, Frank and Teich, Jürgen}, booktitle = {Proceedings of Asilomar Conference on Signals, Systems, and Computers (ASILOMAR)}, date = {2014-11-02/2014-11-05}, doi = {10.1109/ACSSC.2014.7094471}, faupublication = {yes}, isbn = {9781479982974}, pages = {398-403}, peerreviewed = {unknown}, publisher = {IEEE Computer Society}, title = {{Application}-driven reconfiguration of shared resources for timing predictability of {MPSoC} platforms}, venue = {Pacific Grove, CA}, year = {2015} } @inproceedings{faucris.119603924, address = {Paderborn, Germany}, author = {Becher, Andreas and Echavarria Gutiérrez, Jorge Alfonso and Ziener, Daniel and Teich, Jürgen}, booktitle = {AxC15: 1st Workshop on Approximate Computing}, faupublication = {yes}, note = {UnivIS-Import:2017-12-18:Pub.2015.tech.IMMD.inform.approx}, peerreviewed = {Yes}, publisher = {Universität Paderborn}, title = {{Approximate} {Adder} {Structures} on {FPGAs}}, venue = {Paderborn, Germany}, year = {2015} } @inproceedings{faucris.245234948, author = {Echavarria Gutiérrez, Jorge Alfonso and Wildermann, Stefan and Teich, Jürgen}, booktitle = {Design, Automation and Test in Europe, DATE 2021}, date = {2021-02-01/2021-02-05}, doi = {10.23919/date51398.2021.9473952}, faupublication = {yes}, peerreviewed = {Yes}, publisher = {IEEE Computer Society}, title = {{Approximate} {Logic} {Synthesis} of {Very} {Large} {Boolean} {Networks}}, venue = {Alpexpo, Grenoble}, year = {2021} } @inproceedings{faucris.118569704, abstract = {The design and the programming of heterogeneous future MPSoCs including thousands of processor cores is a hard challenge. Means are necessary to program and simulate the dynamic behavior of such systems in order to dimension the hardware design and to verify the software functionality as well as performance goals. Cycle-accurate simulation of multiple parallel applications simultaneously running on different cores of the architecture would be much too slow and is not the desired level of detail. In this paper, we therefore present a novel high-level simulation approach which tackles the complexity and the heterogeneity of such systems and enables the investigation of a new computing paradigm called invasive computing. Here, the workload and its distribution are not known at compile-time but are highly dynamic and have to be adapted to the status (load, temperature, etc.) of the underlying architecture at run-time. We propose an approach for the modeling of tiled MPSoC architectures and the simulation of resource-aware programming concepts on these. This approach delivers important timing information about the parallel execution and also is taking into account the computational properties of possibly different types of cores. © 2012 IEEE.}, address = {New York, NY, USA}, author = {Roloff, Sascha and Hannig, Frank and Teich, Jürgen}, booktitle = {Proc. of the 17th Asia and South Pacific Design Automation Conference (ASP-DAC)}, date = {2012-01-30/2012-02-02}, doi = {10.1109/ASPDAC.2012.6164943}, faupublication = {yes}, isbn = {978-1-4673-0770-3}, note = {UnivIS-Import:2015-04-16:Pub.2012.tech.IMMD.inform.approx}, pages = {187-192}, publisher = {IEEE Press}, title = {{Approximate} {Time} {Functional} {Simulation} of {Resource}-{Aware} {Programming} {Concepts} for {Heterogeneous} {MPSoCs}}, venue = {Sydney}, year = {2012} } @inproceedings{faucris.118074484, abstract = {Management of communication by on-line routing in new FPGAs with a large amount of logic resources and partial reconfigurability is a new challenging problem. A Network-on-Chip (NoC) typically uses packet routing mechanism, which has often unsafe data transfers, and network interface overhead. In this paper, circuit routing for such dynamic NoCs is investigated, and a practical 1-dimensional network with an efficient routing algorithm is proposed and implemented. Also, this concept has been extended to the 2-dimensional case. The implementation results show the low area overhead and high performance of this network. © 2005 IEEE.}, author = {Ahmadinia, Ali and Bobda, Christophe and Ding, Ji and Fekete, Sandor P. and Majer, Mateusz and Teich, Jürgen and Van Der Veen, Jan C.}, booktitle = {Proceedings of the 16th IEEE International Workshop on Rapid System Prototyping}, date = {2005-06-08/2005-06-10}, faupublication = {yes}, note = {UnivIS-Import:2015-04-16:Pub.2005.tech.IMMD.inform.apract}, pages = {84-90}, publisher = {Institute of Electrical and Electronics Engineers}, title = {{A} {Practical} {Approach} for {Circuit} {Routing} on {Dynamic} {Reconfigurable} {Devices}}, venue = {Montreal}, year = {2005} } @inproceedings{faucris.106593344, abstract = {Power consumption is a key challenge for LTE-Advanced or future 5G mobile devices and current power management systems successfully achieve significant power savings. However, these systems are driven by static rules and provide a posteriori responses to traffic and context changes. In this paper, we propose a smart dynamic power management system for cellular modems, extending existing power saving mechanisms by using machine learning-based traffic prediction. With the a priori knowledge of specific scheduling messages, internal device
parameters can be finely tuned to improve the modem power consumption. In order to accurately estimate the power saving potential of several LTE use cases, we build a relevant data set of live network modem traces, as well as a power model of the baseband physical layer and radio frequency components. Subsequently, we propose an evaluation methodology and apply
it to analyze the predictive power management performance in terms of error rate and global power consumption outcome. Our analysis results in maximal power savings of 12% for meaningful traffic scenarios as well as the identification of variables of interest to improve the proposed power manage}, author = {Ah Sue, Jonathan and Brand, Peter and Brendel, Johannes and Hasholzner, Ralph and Falk, Joachim and Teich, Jürgen}, booktitle = {2018 IEEE Wireless Communications and Networking Conference (WCNC'18)}, date = {2018-04-15/2018-04-18}, doi = {10.1109/WCNC.2018.8377189}, editor = {IEEE}, faupublication = {yes}, isbn = {978-1-5386-1734-2}, peerreviewed = {Yes}, title = {{A} {Predictive} {Dynamic} {Power} {Management} for {LTE}-{Advanced} {Mobile} {Devices}}, venue = {Barcelona, Catalonia, Spain}, year = {2018} } @inproceedings{faucris.118704124, abstract = {Continuous software and hardware innovations impose on the one hand a high degree of flexibility from an algorithm and on the other hand it requires that a given processing architecture has the capability to adapt to changing computation patterns at run-time. In this work, we demonstrate how a computer vision application can adapt itself at runtime in order to satisfy different requirements of quality and throughput. For that, we consider an implementation of the Harris Corner Detector on an MPSoC (Multi-Processor System-on-Chip) architecture composed of a quad-core RISC processor and one accelerator based on a programmable massively parallel processor array.}, address = {New York, NY, USA}, author = {Sousa, Éricles and Tanase, Alexandru-Petru and Hannig, Frank and Teich, Jürgen}, booktitle = {Proc. 2013 Conference on Design and Architectures for Signal and Image Processing}, date = {2013-10-08/2013-10-10}, faupublication = {yes}, isbn = {979-10-92279-02-3}, note = {UnivIS-Import:2015-04-16:Pub.2013.tech.IMMD.inform.aproto}, pages = {361-362}, publisher = {IEEE Press}, title = {{A} {Prototype} of an {Adaptive} {Computer} {Vision} {Algorithm} on {MPSoC} {Architecture}}, venue = {Cagliari}, year = {2013} } @inproceedings{faucris.108646824, abstract = {Invasive computing is a novel paradigm for the exploitation of runtime parallelism of future MPSoC architectures through resource-aware programming and dynamic reconfiguration of the underlying architectures. Based on the state and availability of resources, an invasive algorithm organizes its computation itself. A highly-parameterizable weakly programmable VLIW processor array called Tightly-Coupled Processor Array (TCPA) that is augmented with decentralized units for hardware-accelerated resource exploration is well suited for invasive computing, especially for accelerating computationally intensive loop programs. We present a prototype of an invasive TCPA, which demonstrates the benefits of invasive computing using the example of video processing. More specifically, the targeted applications in this prototype are 2D convolution filters (such as edge detection and Gaussian filtering) that are applied on a real-time input video stream. For different workload scenarios and resource competition between applications, a varying quality of the output video streams is shown. © 2012 ECSI.}, address = {New York, NY, USA}, author = {Muddasani, Shravan and Boppu, Srinivas and Hannig, Frank and Kuzmin, Boris and Lari, Vahid and Teich, Jürgen}, booktitle = {Proc. of the 2012 Conference on Design and Architectures for Signal and Image Processing (DASIP)}, date = {2012-10-23/2012-10-25}, faupublication = {yes}, isbn = {978-1-4673-2089-4}, note = {UnivIS-Import:2015-04-16:Pub.2012.tech.IMMD.inform.aproto}, pages = {393-394}, publisher = {IEEE Press}, title = {{A} {Prototype} of an {Invasive} {Tightly}-{Coupled} {Processor} {Array}}, venue = {Karlsruhe}, year = {2012} } @inproceedings{faucris.118208024, abstract = {Static and dynamic variations, which have negative impact on the reliability of microelectronic systems, increase with smaller CMOS technology. Thus, further downscaling is only profitable if the costs in terms of area, energy and delay for reliability keep within limits. Therefore, the traditional worst case design methodology will become infeasible. Future architectures have to be error resilient, i.e., the hardware architecture has to tolerate autonomously transient errors. In this paper, we present an FPGA based rapid prototyping system for multi-processor systems-on-chip composed of autonomous hardware units for error-resilient processing and interconnect. This platform allows the fast architectural exploration of various error protection techniques under different failure rates on the microarchitectural level while keeping track of the system behavior. We demonstrate its applicability on a concrete wireless communication system. © 2010 EDAA.}, author = {May, Matthias and Wehn, Norbert and Bouajila, Abdelmajid and Zeppenfeld, Johannes and Stechele, Walter and Herkersdorf, Andreas and Ziener, Daniel and Teich, Jürgen}, booktitle = {Proc. Design, Automation and Test in Europe}, date = {2010-03-08/2010-03-12}, faupublication = {yes}, isbn = {978-1-4244-7054-9}, note = {UnivIS-Import:2015-04-16:Pub.2010.tech.IMMD.inform.arapid}, pages = {375-380}, title = {{A} {Rapid} {Prototyping} {System} for {Error}-{Resilient} {Multi}-{Processor} {Systems}-on-{Chip}}, venue = {Dresden}, year = {2010} } @article{faucris.120121804, author = {Fischer, Dirk and Teich, Jürgen and Thies, Michael and Weper, Ralph}, faupublication = {no}, journal = {International Journal of Circuits, Systems and Signal Processing}, note = {UnivIS-Import:2015-03-09:Pub.2002.tech.IMMD.inform.archit}, peerreviewed = {unknown}, title = {{Architecture}/{Compiler} {Co}-{Exploration} for {ASIPs}}, year = {2002} } @inproceedings{faucris.111771264, author = {Richthammer, Valentina and Schwarzer, Tobias and Wildermann, Stefan and Teich, Jürgen and Glaß, Michael}, booktitle = {55th ACM/EDAC/IEEE Design Automation Conference (DAC 2018)}, date = {2018-06-24/2018-06-28}, doi = {10.1109/DAC.2018.8465811}, faupublication = {yes}, peerreviewed = {unknown}, title = {{Architecture} {Decomposition} in {System} {Synthesis} of {Heterogeneous} {Many}-{Core} {Systems}}, venue = {San Francisco, CA}, year = {2018} } @book{faucris.119368304, author = {Hannig, Frank and Cardoso, João and Fey, Dietmar and Schröder-Preikschat, Wolfgang and Teich, Jürgen}, doi = {10.1007/978-3-319-30695-7}, faupublication = {yes}, isbn = {9783319306940}, peerreviewed = {unknown}, publisher = {Springer Verlag}, title = {{Architecture} of computing systems – {ARCS} 2016: 29th international conference {Nuremberg}, {Germany}, {April} 4-7, 2016 {Proceedings}}, volume = {9637}, year = {2016} } @inproceedings{faucris.118193724, abstract = {In this paper, we introduce a constraint programming-based approach for the optimization of area and of reconfiguration time for communication networks for a class of regular 2D reconfigurable processor array architectures. For a given set of different algorithms the execution of which is supposed to be switched upon request at run-time, we provide static solutions for the optimal routing of data between processors. Here, we support also multi-casting data transfers for the first time. The routing found by our method minimizes the area or the reconfiguration time of the communication network, when switching between the execution of these algorithms. In fact, when switching, the communication network reconfiguration can be executed in just a few clock cycles. Moreover the communication network area can be minimized significantly (62% in average). ©2008 IEE}, address = {New York}, author = {Wolinski, Christophe and Kuchcinski, Krzysztof and Teich, Jürgen and Hannig, Frank}, booktitle = {Proceedings of the International Conference on Field Programmable Logic and Applications}, date = {2008-09-08/2008-09-10}, doi = {10.1109/FPL.2008.4629969}, faupublication = {yes}, note = {UnivIS-Import:2015-04-16:Pub.2008.tech.IMMD.inform.areaan}, pages = {391-396}, publisher = {IEEE Press}, title = {{Area} and {Reconfiguration} {Time} {Minimization} of the {Communication} {Network} in {Regular} {2D} {Reconfigurable} {Architectures}}, venue = {Heidelberg}, year = {2008} } @inproceedings{faucris.118149944, author = {Sousa, Éricles and Tanase, Alexandru-Petru and Hannig, Frank and Teich, Jürgen}, booktitle = {International Conference on ReConFigurable Computing and FPGA's (ReConFig)}, date = {2017-12-04/2017-12-06}, doi = {10.1109/RECONFIG.2017.8279768}, faupublication = {yes}, peerreviewed = {Yes}, title = {{A} {Reconfigurable} {Memory} {Architecture} for {System} {Integration} of {Coarse}-{Grained} {Reconfigurable} {Arrays}}, url = {http://ieeexplore.ieee.org/document/8279768/}, venue = {Cancun, Mexico}, year = {2017} } @inproceedings{faucris.117709944, abstract = {Electronic System Level (ESL) tools are becoming more and more important in order to bridge the well-known productivity design gap. This panel brings together specialists from industry and ESL tool houses to discuss whether current ESL tools available are usable and meeting the requirements of the industry. In particular, we would like to know to what degree currently available tools from major ESL tool houses are used for designing what kind of systems in industry, and for what purpose such as analysis, design space exploration, or synthesis, etc. these tools are currently used. Also, we would like to discuss to what degree existing tools can help in reducing design time, and finally, what the industry sees the most challenging features of tools currently not yet available. On the other side, we would like to know the opinion of the ESL tool houses what they see the most important offerings of ESL tools currently available as well as the most challenging points and problems why ESL hasn't really taken off so far. Could this either be an industry issue, a methodology issue, a design issue or a tool issue. © Copyright 2006 ACM.}, address = {New York, NY, USA}, author = {Teich, Jürgen}, booktitle = {CODES ISSS '06: Proceedings of the 4th international conference on Hardware/software codesign and system synthesis}, date = {2006-10-22/2006-10-25}, doi = {10.1145/1176254.1176295}, faupublication = {yes}, keywords = {Behavioral and system synthesis; Electronic design automation; Electronic System Level (ESL) design}, note = {UnivIS-Import:2015-04-16:Pub.2006.tech.IMMD.inform.arecur}, pages = {166}, publisher = {ACM Press}, title = {{Are} {Current} {ESL} {Tools} {Meeting} the {Requirements} of {Advanced} {Embedded} {Systems}?}, venue = {Seoul}, year = {2006} } @incollection{faucris.214436081, abstract = {In this article, an efficient rule-based clustering algorithm for static dataflow subgraphs in a dynamic dataflow graph is presented. The clustered static dataflow actors are quasi-statically scheduled, in such a way that the global performance in terms of latency and throughput is improved compared to a dynamically scheduled execution, while avoiding the introduction of deadlocks as generated by naive static scheduling approaches. The presented clustering algorithm outperforms previously published approaches by a faster computation and more compact representation of the derived quasi-static schedule. This is achieved by a rulebased approach, which avoids an explicit enumeration of the state space. A formal proof of the correctness of the presented clustering approach is given. Experimental results show significant improvements in both, performance and code size, compared to a state-of-the-art clustering algorithm. © 2013 ACM.}, address = {New York, NY}, author = {Falk, Joachim and Zebelein, Christian and Haubelt, Christian and Teich, Jürgen}, booktitle = {ACM Transactions on Embedded Computing Systems}, doi = {10.1145/2442116.2442124}, faupublication = {yes}, keywords = {Actor-oriented design; Clustering; Data flow analysis; Scheduling}, note = {UnivIS-Import:2019-03-25:Pub.2013.tech.IMMD.inform.aruleb}, pages = {74:1-74:31}, peerreviewed = {Yes}, publisher = {ACM}, title = {{A} {Rule}-{Based} {Quasi}-{Static} {Scheduling} {Approach} for {Static} {Islands} in {Dynamic} {Dataflow} {Graphs}}, volume = {12(3)}, year = {2013} } @article{faucris.117074804, abstract = {In this article, an efficient rule-based clustering algorithm for static dataflow subgraphs in a dynamic dataflow graph is presented. The clustered static dataflow actors are quasi-statically scheduled, in such a way that the global performance in terms of latency and throughput is improved compared to a dynamically scheduled execution, while avoiding the introduction of deadlocks as generated by naive static scheduling approaches. The presented clustering algorithm outperforms previously published approaches by a faster computation and more compact representation of the derived quasi-static schedule. This is achieved by a rulebased approach, which avoids an explicit enumeration of the state space. A formal proof of the correctness of the presented clustering approach is given. Experimental results show significant improvements in both, performance and code size, compared to a state-of-the-art clustering algorithm. © 2013 ACM.}, author = {Falk, Joachim and Zebelein, Christian and Haubelt, Christian and Teich, Jürgen}, doi = {10.1145/2442116.2442124}, faupublication = {yes}, journal = {ACM Transactions on Embedded Computing Systems}, keywords = {Actor-oriented design; Clustering; Data flow analysis; Scheduling}, peerreviewed = {Yes}, title = {{A} rule-based quasi-static scheduling approach for static islands in dynamic dataflow graphs}, volume = {12}, year = {2013} } @inproceedings{faucris.122020624, abstract = {In this paper, an efficient embedded software synthesis approach based on a generalized clustering algorithm for static dataflow subgraphs embedded in general dataflow graphs is proposed. The clustered subgraph is quasi-statically scheduled, thus improving performance of the synthesized software in terms of latency and throughput compared to a dynamically scheduled execution. The proposed clustering algorithm outperforms previous approaches by a faster computation and a more compact representation of the derived quasi-static schedules. This is achieved by a rule-based approach, which avoids an explicit enumeration of the state space. Experimental results show significant improvements in both performance and code size when compared to a state-of-the-art clustering algorithm. © 2011 EDAA.}, author = {Falk, Joachim and Zebelein, Christian and Haubelt, Christian and Teich, Jürgen}, booktitle = {Proceedings of Design, Automation and Test in Europe (DATE'11)}, date = {2011-03-14/2011-03-18}, faupublication = {yes}, isbn = {9783981080179}, keywords = {Actor-Oriented Design; MPSoC Scheduling; Software Synthesis}, pages = {521-526}, title = {{A} rule-based static dataflow clustering algorithm for efficient embedded software synthesis}, url = {https://www.scopus.com/inward/record.url?partnerID=HzOxMe3b&scp=79957549287&origin=inward}, venue = {Grenoble}, year = {2011} } @inproceedings{faucris.236619978, abstract = {With approaching exascale performance, applications in the domain of High-Performance Computing (HPC) have to scale to an ever-increasing amount of compute nodes. The Global Address Space Programming Interface (GASPI) communication API promises to handle this challenge by providing a highly flexible and efficient programming model in a Partitioned Global Address Space (PGAS).

Suitable applications targeting supercomputers include the domain of mesh-based solvers for Partial Differential Equations (PDEs) due to their high computational intensity. The implementation of such solvers is highly interdisciplinary, which therefore requires an abstraction of hardware-specific parallelization techniques from developing numerical algorithms.

We present an open-source Run-Time System (RTS) that distributes and parallelizes device-agnostic kernels, which define algorithms on unstructured grids. We describe how the RTS abstracts common parts of iterative solvers and further explain how to parallelize and distribute these components. We further show the efficiency of our approach for several microbenchmarks and an implementation of the Discontinuous Galerkin Method (DGM). The results show that we can almost completely hide all synchronization overhead and that the RTS only imposes a small computational cos}, author = {Groth, Stefan and Grünewald, Daniel and Teich, Jürgen and Hannig, Frank}, booktitle = {Proceedings of the 17th ACM International Conference on Computing Frontiers (CF)}, date = {2020-05-11/2020-05-13}, doi = {10.1145/3387902.3392628}, faupublication = {yes}, isbn = {978-1-4503-7956-4/20/05}, keywords = {Runtime System; Finite Element Methods; PGAS; GASPI}, pages = {39-48}, peerreviewed = {Yes}, publisher = {ACM}, title = {{A} {Runtime} {System} for {Finite} {Element} {Methods} in a {Partitioned} {Global} {Address} {Space}}, venue = {Catania, Sicily, Italy}, year = {2020} } @inproceedings{faucris.251887229, author = {Plagwitz, Patrick and Hannig, Frank and Ströbel, Martin and Strohmeyer, Christoph and Teich, Jürgen}, booktitle = {Proceedings of the 29th IEEE International Symposium on Field-Programmable Custom Computing Machines (FCCM)}, date = {2021-05-09/2021-05-12}, doi = {10.1109/FCCM51124.2021.00010}, faupublication = {yes}, peerreviewed = {Yes}, publisher = {IEEE}, title = {{A} {Safari} through {FPGA}-based {Neural} {Network} {Compilation} and {Design} {Automation} {Flows}}, url = {https://ieeexplore.ieee.org/document/9444092}, venue = {Virtual Conference}, year = {2021} } @article{faucris.122860144, abstract = {Many problems in computational science and engineering involve partial differential equations and thus require the numerical solution of large, sparse (non)linear systems of equations. Multigrid is known to be one of the most efficient methods for this purpose. However, the concrete multigrid algorithm and its implementation highly depend on the underlying problem and hardware. Therefore, changes in the code or many different variants are necessary to cover all relevant cases. In this article we provide a prototype implementation in Scala for a framework that allows abstract descriptions of PDEs, their discretization, and their numerical solution via multigrid algorithms. From these, one is able to generate data structures and implementations of multigrid components required to solve elliptic PDEs on structured grids. Two different test problems showcase our proposed automatic generation of multigrid solvers for both CPU and GPU target platform}, author = {Köstler, Harald and Schmitt, Christian and Kuckuk, Sebastian and Kronawitter, Stefan and Hannig, Frank and Teich, Jürgen and Rüde, Ulrich and Lengauer, Christian}, doi = {10.1504/IJCSE.2017.10003829}, faupublication = {yes}, journal = {International Journal of Computational Science and Engineering}, pages = {150-163}, peerreviewed = {Yes}, title = {{A} {Scala} {Prototype} to {Generate} {Multigrid} {Solver} {Implementations} for {Different} {Problems} and {Target} {Multi}-{Core} {Platforms}}, volume = {14}, year = {2017} } @article{faucris.320897331, author = {Spieck, Jan and Wildermann, Stefan and Teich, Jürgen}, doi = {10.1145/3660633}, faupublication = {yes}, journal = {ACM Transactions on Design Automation of Electronic Systems}, peerreviewed = {Yes}, title = {{A} {Scenario}-{Based} {DVFS}-{Aware} {Hybrid} {Application} {Mapping} {Methodology} for {MPSoCs}}, year = {2024} } @inproceedings{faucris.116098884, abstract = {In this paper, we propose a self-adaptive FPGAbased, partially reconfigurable system for space missions in order to mitigate Single Event Upsets in the FPGA configuration and fabric. Dynamic reconfiguration is used here for an on-demand replication of modules in dependence of current and changing radiation levels. More precisely, the idea is to trigger a redundancy scheme such as Dual Modular Redundancy or Triple Modular Redundancy in response to a continuously monitored Single Event Upset rate measured inside the on-chip memories itself, e.g., any subset (even used) internal Block RAMs. Depending on the current radiation level, the minimal number of replicas is determined at runtime under the constraint that a required Safety Integrity Level for a module is ensured and configured accordingly. For signal processing applications it is shown that this autonomous adaption to the different solar conditions realizes a resource efficient mitigation. In our case study, we show that it is possible to triplicate the data throughput at the Solar Maximum condition (no flares) compared to a Triple Modular Redundancy implementation of a single module. We also show the decreasing Probability of Failures Per Hour by 2 × 10 at flare-enhanced conditions compared with a non-redundant system. Our work is a part of the In-Orbit Verification of the Heinrich Hertz communication satellite.}, author = {Glein, Robert and Schmidt, Bernhard and Rittner, Florian and Teich, Jürgen and Ziener, Daniel}, booktitle = {Proceedings of Field-Programmable Custom Computing Machines (FCCM 2014)}, date = {2014-05-11/2014-05-13}, doi = {10.1109/FCCM.2014.79}, faupublication = {yes}, isbn = {9781479951116}, pages = {251-258}, publisher = {Institute of Electrical and Electronics Engineers Inc.}, title = {{A} self-adaptive {SEU} mitigation system for {FPGAs} with an internal block {RAM} radiation particle sensor}, venue = {Boston}, year = {2014} } @inproceedings{faucris.112677664, abstract = {Due to the raising complexity in distributed embedded systems, a single designer will not be able to plan and organize the communication for such systems. Therefore, it will get more and more important to relieve the designer in that task. Our idea is a communication system that is capable to organize itself to satisfy predefined properties. In this paper, we want to solve the problem of establishing fair bandwidth sharing on priority-based buses by using simple local rules on the distributed system to avoid a single point of failure and cope with online system changes. Based on a game theoretical analysis, a multi-agent reinforcement learning algorithm is proposed that establishes fair bandwidth distribution. The main idea is to penalize nodes that claim too much bandwidth by the other nodes. We experimentally evaluated the algorithm with different parameter settings. The algorithm showed to converge to a fair solution in any experiment. This means the system is able to completely self-organize without global information for our assumptions. In addition, we could figure out that we can configure a trade-off between convergence speed and computation effort. We hope this is a small first step towards totally self-organizing real-time systems. © 2010 IEEE.}, author = {Ziermann, Tobias and Mühleis, Nina and Wildermann, Stefan and Teich, Jürgen}, booktitle = {Proceedings of the 1st IEEE Workshop on Self-Organizing Real-Time systems (SORT 2010)}, date = {2010-05-11/2010-05-11}, doi = {10.1109/ISORCW.2010.18}, faupublication = {yes}, isbn = {9780769540382}, keywords = {Bus-based communication; Multi-agent reinforcement learning; Self-organizing}, pages = {11-20}, peerreviewed = {unknown}, title = {{A} self-organizing distributed reinforcement learning algorithm to achieve fair bandwidth allocation for priority-based bus communication}, venue = {Carmona, Sevilla}, volume = {2}, year = {2010} } @inproceedings{faucris.117076564, abstract = {As a result of the increased demand for bandwidth, current automotive networks are getting more heterogeneous. New technologies like Ethernet as a packet-switched point-to-point network are introduced. Nevertheless, the requirements on stand-by power consumption and short activation times are still the same as for existing field buses. Ethernet does not provide wakeup mechanisms that are sufficient for automotive systems. As a remedy, this paper introduces a novel physical-layer mechanism called Low Frequency Wakeup that is largely independent of the communication technology and topology used. It provides parallel and remote wakeup for all nodes even in a point-to-point network as well as full support of partial networking. The overall wakeup detection time is smaller than 10ms and every node can actively feed a wakeup signal asynchronously to all other nodes. In terms of latency, it is shown that Low Frequency Wakeup reaches a reduction of more than 30 % for a three-hop network and more than 50 % for a five-hop network in comparison to the current state-of-the-art technology for automotive point-to-point networks. © 2014 EDAA.}, author = {Seyler, Jan R. and Streichert, Thilo and Warkentin, Juri and Spägele, Matthias and Glaß, Michael and Teich, Jürgen}, booktitle = {Proceedings of Design, Automation and Test in Europe (DATE 2014)}, date = {2014-03-24/2014-03-28}, doi = {10.7873/DATE2014.019}, faupublication = {yes}, isbn = {9783981537024}, pages = {6}, peerreviewed = {unknown}, publisher = {Institute of Electrical and Electronics Engineers Inc.}, title = {{A} self-propagating wakeup mechanism for point-to-point networks with partial network support}, venue = {Dresden}, year = {2014} } @inproceedings{faucris.118194604, abstract = {Online adaptation is a key requirement for image processing applications when used in dynamic environments. In contrast to batch learning, where retraining is required each time a new observation occurs, sequential learning algorithms offer the ability to iteratively adapt the existing classifier. In this paper, we present a neural network architecture and a fast online learning algorithm that allow to use the class of resource allocation networks for such adaptive image processing applications. The network is based on receptive fields that are processed by RBF sub-nets. The learning algorithm builds such networks online by adding new units to the sub-nets each time novel input data is observed. For this, we define a global and a local novelty criterion. Experimental results show that the proposed network outperforms existing RAN algorithms when used for face detection and recognition and is competitive with existing classifiers. © 2008 IEEE.}, address = {New York}, author = {Teich, Jürgen and Wildermann, Stefan}, booktitle = {Proceedings of the 8th International Conference on Hybrid Intelligent Systems}, date = {2008-09-10/2008-09-12}, doi = {10.1109/HIS.2008.101}, faupublication = {yes}, note = {UnivIS-Import:2015-04-16:Pub.2008.tech.IMMD.inform.aseque}, pages = {132-137}, publisher = {IEEE Press}, title = {{A} {Sequential} {Learning} {Resource} {Allocation} {Network} for {Image} {Processing} {Applications}}, venue = {Barcelona}, year = {2008} } @article{faucris.122986204, abstract = {Digital signal processing algorithms are of big importance in many embedded systems. Due to complexity reasons and due to the restrictions imposed on the implementations, new design methodologies are needed. In this paper, we present a SystemC-based solution supporting automatic design space exploration, automatic performance evaluation, as well as automatic system generation for mixed hardware/software solutions mapped onto FPGA-based platforms. Our proposed hardware/software codesign approach is based on a SystemC-based library called SysteMoC that permits the expression of different models of computation well known in the domain of digital signal processing. It combines the advantages of executability and analyzability of many important models of computation that can be expressed in SysteMoC. We will use the example of an MPEG-4 decoder throughout this paper to introduce our novel methodology. Results from a five-dimensional design space exploration and from automatically mapping parts of the MPEG-4 decoder onto a Xilinx FPGA platform will demonstrate the effectiveness of our approach.}, author = {Haubelt, Christian and Falk, Joachim and Keinert, Joachim and Schlichter, Thomas and Streubühr, Martin and Deyhle, Andreas and Hadert, Andreas and Teich, Jürgen}, doi = {10.1155/2007/47580}, faupublication = {yes}, journal = {EURASIP Journal on Embedded Systems}, note = {UnivIS-Import:2015-04-14:Pub.2007.tech.IMMD.inform.asyste}, pages = {Article ID 47580, 22 pages}, peerreviewed = {unknown}, title = {{A} {SystemC}-based {Design} {Methodology} for {Digital} {Signal} {Processing} {Systems}}, volume = {2007}, year = {2007} } @inproceedings{faucris.119368744, abstract = {There is trend towards networked and distributed hardware reconfigurable systems, complicating the design process at the system-level. This paper will provide a solution to the problem of design space exploration for such embedded systems of the next generation. We will show the problems occurring while exploring the design space at the system-level, leading to new properties for valid implementations. The novelty of this approach lies in the support of explicit communication modeling and time-multiplexed architecture modeling in a single model. The proposed design space exploration is based on Evolutionary Algorithms and a new slack-based list scheduler. © 2005 IEEE.}, author = {Haubelt, Christian and Otto, Stephan and Grabbe, Cornelia and Teich, Jürgen}, booktitle = {Proceedings of Asia and South Pacific Design Automation Conference (ASP-DAC'05)}, date = {2005-01-18/2005-01-21}, faupublication = {yes}, isbn = {9780780387362}, pages = {298-301}, peerreviewed = {unknown}, title = {{A} system-level approach to hardware reconfigurable systems}, url = {https://www.scopus.com/inward/record.url?partnerID=HzOxMe3b&scp=84861440285&origin=inward}, venue = {Shanghai}, volume = {1}, year = {2005} } @inproceedings{faucris.109504164, abstract = {System-level synthesis is the task of automatically implementing application models as hardware/software systems. It encompasses four basic sub tasks, namely decision making and refinement for both computation and communication. In the past, several system-level synthesis approaches have been proposed. However, it was shown that each of these approaches has drawbacks in at least one of the four sub tasks. In this paper, we present our efforts towards a comprehensive system-level synthesis by combining two academic system-level solutions into a seamless approach that automatically generates pin-accurate implementation-level models starting from a formal application model and generic MPSoC architecture templates. We analyze the system-level synthesis flow and define intermediate representations in terms of transaction level models that serve as link between existing tools. Furthermore, we present the automated transformation between models for combining two design flows. We demonstrate the combined flow on an industrial-strength example and show the benefits of fully automatic exploration and synthesis for rapid and early system-level design. ©2010 IEEE.}, author = {Gladigau, Jens and Gerstlauer, Andreas and Haubelt, Christian and Streubühr, Martin and Teich, Jürgen}, booktitle = {Proceedings of the International Conference on Embedded Computer Systems: Architectures, Modeling and Simulation (SAMOS)}, date = {2010-07-19/2010-07-22}, doi = {10.1109/ICSAMOS.2010.5642076}, faupublication = {yes}, isbn = {9781424479382}, pages = {118-125}, peerreviewed = {unknown}, title = {{A} system-level synthesis approach from formal application models to generic bus-based {MPSoCs}}, venue = {Samos}, year = {2010} } @inproceedings{faucris.106328464, abstract = {

Today, facilities used for scientific computing are highly parallel and becoming more and more heterogeneous. This trend can be easily seen in the TOP500 list, where an increasing number of systems is equipped with accelerators, such as GPUs or many-cores. To achieve the best performance on such machines, special tweaking of the code is necessary, which takes time and expert knowledge of the hardware and corresponding optimization techniques. Domain-specific languages (DSLs) are a remedy to this dilemma by separating the algorithm specification from its implementation, leaving room for optimizations to be applied automatically by the DSL compiler. Thus, the compiler needs to have a profound knowledge of the target platform, e.g., available accelerators and how to program them, details of the network topology to optimize communication patterns, as well as CPU specifications for cache optimizations and vectorization. In this paper, we introduce our approach to modeling hardware and software information to provide platform details that our code generator requires to optimize and emit code for the solution of partial differential equations using the geometric multigrid method.

}, address = {Berlin}, author = {Schmitt, Christian and Hannig, Frank and Teich, Jürgen}, booktitle = {Workshop Proceedings of the 31st GI/ITG International Conference on Architecture of Computing Systems (ARCS)}, date = {2018-04-09/2018-04-12}, faupublication = {yes}, isbn = {978-3-8007-4559-3}, pages = {59-66}, peerreviewed = {Yes}, publisher = {VDE VERLAG GmbH}, title = {{A} {Target} {Platform} {Description} {Language} for {Parallel} {Code} {Generation}}, url = {https://www12.cs.fau.de/downloads/schmittch/publications/SHT18arcs.pdf}, venue = {Braunschweig}, year = {2018} } @inproceedings{faucris.122566884, author = {Teich, Jürgen and Thiele, Lothar}, booktitle = {Proc. Int. Conf. on Application Specific Array Processors}, faupublication = {no}, note = {UnivIS-Import:2015-04-16:Pub.1992.tech.IMMD.inform.atrans}, pages = {4-20}, title = {{A} transformative approach to the partitioning of processor arrays}, venue = {Berkeley, CA, U.S.A.,}, year = {1992} } @inproceedings{faucris.111065284, author = {Teich, Jürgen and Hannig, Frank and Ruckdeschel, Holger and Dutta, Hritam and Kissler, Dmitrij and Stravet, Andrej}, booktitle = {Proceedings of the International Conference on Engineering of Reconfigurable Systems and Algorithms (ERSA)}, date = {2007-06-25/2007-06-28}, faupublication = {yes}, pages = {14-24}, peerreviewed = {unknown}, title = {{A} {Unified} {Retargetable} {Design} {Methodology} for {Dedicated} and {Re}-{Programmable} {Multiprocessor} {Arrays}: {Case} {Study} and {Quantitative} {Evaluation}}, venue = {Las Vegas, NV}, year = {2007} } @inproceedings{faucris.118704564, abstract = {Networks-on-Chip have shown their scalability for future many-core systems on chip. In real world scenarios, where multiple applications are being executed over a shared NoC based platform, efficient utilization of Networks-on-Chip resources becomes challenging. Methodologies are required to ensure better utilization of NoC, especially in the scenarios, where the communication patterns of NoC traffic are difficult to predict before run-time. In this paper, we propose a self-optimization mechanism which detects frequent communication by monitoring communication patterns at run-time and uses this information to establish virtual connections autonomously. Communication monitoring and connection establishment are realized in hardware. Hardware managed virtual connections lead to better utilization of NoC resources and reduce the communication latencies suffered by applications. In addition, energy consumption by the communication infrastructure is reduced. The proposed concept is investigated through simulation of real world application scenarios. The simulation results highlight the performance improvement and synthesis results show the low area overhead of the proposed hardware implementation. © 2013 IEEE.}, author = {Zaib, Aurang and Heisswolf, Jan and Weichslgartner, Andreas and Wild, Thomas and Teich, Jürgen and Becker, Jürgen and Herkersdorf, Andreas}, booktitle = {Proc. 16th Euromicro Conference on Digital System Design}, date = {2013-09-04/2013-09-06}, doi = {10.1109/DSD.2013.87}, faupublication = {yes}, note = {UnivIS-Import:2015-04-16:Pub.2013.tech.IMMD.inform.autogs}, pages = {761-768}, series = {DSD '13}, title = {{AUTO}-{GS}: {Self}-optimization of {NoC} {Traffic} {Through} {Hardware} {Managed} {Virtual} {Connections}}, venue = {Cantabria}, year = {2013} } @inproceedings{faucris.117953924, author = {Evans, Brian L. and Schwarz, Christian and Teich, Jürgen}, booktitle = {Proc. IEEE Asilomar Conf. on Signals, Systems and Computers}, faupublication = {no}, note = {UnivIS-Import:2015-04-16:Pub.1994.tech.IMMD.inform.automa}, pages = {363-367}, title = {{Automateddesign} of two-dimensional rational decimation systems}, venue = {Pacific Grove, CA, U.S.A.,}, year = {1994} } @inproceedings{faucris.117991104, author = {Bednara, Marcus and Hardt, Wolfram and Rettberg, Achim and Teich, Jürgen}, booktitle = {Proc. Ninth Annual International HDL Conference and Exhibition}, faupublication = {no}, note = {UnivIS-Import:2015-04-16:Pub.2000.tech.IMMD.inform.automa}, pages = {???}, title = {{Automated} {Design} {Space} {Exploration} on {System} {Level} for {Embedded} {Systems}}, venue = {San Jose, CA}, year = {2000} } @inproceedings{faucris.117450564, author = {Hannig, Frank and Kupriyanov, Olexiy and Teich, Jürgen}, booktitle = {Proceedings of the Workshop on Compilers and Tools for Constrained Embedded Systems (CTCES 2004)}, faupublication = {yes}, peerreviewed = {unknown}, title = {{Automatic} and {Optimized} {Generation} of {Compiled} {High}-{Speed} {RTL} {Simulators}}, venue = {Washington, DC}, year = {2004} } @article{faucris.122644984, abstract = {This work presents a communication-driven virtual prototyping approach integrated in an existing ESL design methodology to automatically synthesize, evaluate, and optimize a data-flow application for mixed hardware/software and even networked MPSoCs. While existing synthesis tools are suitable for individual subsystems (e.g., software tasks for CPUs, hardware accelerators), the problem of establishing the communication between different subsystems that may even be simulated at different levels of abstraction is still challenging. As a remedy, we introduce the concept of bridge components in our architecture model that, during virtual prototyping, serve as integrators between subsystems that may have different communication protocols and be simulated at different levels of abstraction (e.g., TLM, behavioral level, RTL). We propose to consider bridges throughout the complete ESL design flow: Already during Design Space Exploration (DSE), the characteristics of bridge components such as implementation cost and additional latency on the application can be taken into account. Moreover, we extend the exploration model of the DSE to include required communication-related design decisions, i.e., the mapping of binary code for software tasks and the selection of different synchronization patterns for the communication. For virtual prototyping of implementation candidates derived by the DSE, the bridge components enable to automatically disassemble the system into subsystems and hand each subsystem over to an individual synthesis tool. When integrating the subsystems together, our methodology also synthesizes the interfaces for all bridges which significantly simplifies system integration. As a proof of concept, we present (I) a distributed control application that is transformed into a virtual prototype consisting of six subsystems and (II) a data-flow application from the video processing domain transformed into a virtual prototype consisting of three subsystems. The resulting subsystems can be concurrently simulated at TLM, behavioral level, and RTL. The experiments give evidence of the proposed technique's applicability, the achieved productivity gain, and the resulting simulation performance at the considered levels of abstractio}, author = {Falk, Joachim and Schwarzer, Tobias and Zhang, Liyuan and Glaß, Michael and Teich, Jürgen}, doi = {10.1016/j.micpro.2015.08.008}, faupublication = {yes}, journal = {Microprocessors and Microsystems}, keywords = {Communication refinement; ESL design flow; SystemC-TLM; Virtual prototyping}, pages = {1012–1028}, peerreviewed = {Yes}, title = {{Automatic} communication-driven virtual prototyping and design for networked embedded systems}, volume = {39}, year = {2015} } @inproceedings{faucris.123588124, abstract = {

In different domains, Simulink has gained a lot of acceptance because it is a robust tool for rapid design and simulation of control systems. However, it is not supported by Simulink. In the literature, there are some of the most popular translation tools in the world. C or C ++ code from Simulink models, which is used for implementation or validation purposes. (ESL) Design methodologies and tools like SystemCoDesigner. (DFGs) from a given Simulink model. (ESD) and design code for hardware / software parts directly from the ESL model. As a validation step, we found the results of the simulation in Simulink and the results of the DFG in SysteMoC for a signal processing case study.

}, author = {Letras, Martin and Falk, Joachim and Wildermann, Stefan and Teich, Jürgen}, booktitle = {20th International Workshop on Software and Compilers for Embedded Systems (SCOPES)}, date = {2017-06-12/2017-06-13}, doi = {10.1145/3078659.3078668}, faupublication = {yes}, isbn = {978-1-4503-5039-6/17/06}, keywords = {Data Flow Graph; Simulink; SyteMoC; Code Generation}, peerreviewed = {unknown}, title = {{Automatic} {Conversion} of {Simulink} {Models} to {SysteMoC} {Actor} {Networks}}, venue = {Sankt Goar}, year = {2017} } @inproceedings{faucris.120741544, abstract = {This paper presents a new tool for the automatic generation of highly parallelized Finite Impulse Response (FIR) filters. In this approach we follow our PARO design methodology. PARO is a design system project for modeling, transformation, optimization, and synthesis of massively parallel VLSI architectures. The FIR filter generator employs during the design flow the following advanced transformations, (a) hierarchical partitioning in order to balance the amount of local memory with external communication, and (b), partial localization to achieve higher throughput and smaller latencies. Furthermore, our filter generator allows for design space exploration to tackle trade-offs in cost and speed. Finally, synthesizable VHDL code is generated and mapped to an FPGA, the results are compared with a commercial filter generator. © Springer-Verlag Berlin Heidelberg 2005.}, address = {Berlin, Heidelberg, New York}, author = {Dutta, Hritam and Hannig, Frank and Teich, Jürgen and et al.}, author_hint = {Dutta Hritam, Hannig Frank, Ruckdeschel Holger, Teich Jürgen}, booktitle = {In Proceedings of the 5th International Workshop on Embedded Computer Systems, Architectures, Modeling, and Simulation (SAMOS 2005)}, date = {2005-07-18/2005-07-20}, faupublication = {yes}, isbn = {3-540-26969-X}, note = {UnivIS-Import:2015-04-16:Pub.2005.tech.IMMD.inform.automa{\_}0}, pages = {51-61}, publisher = {Springer-verlag}, series = {Lecture Notes in Computer Science (LNCS)}, support_note = {Author relations incomplete. You may find additional data in field 'author{\_}hint'}, title = {{Automatic} {FIR} {Filter} {Generation} for {FPGAs}}, venue = {Island of Samos}, volume = {3553}, year = {2005} } @inproceedings{faucris.117078544, abstract = {Virtual prototyping is a more and more accepted technology to enable early software development in the design flow of embedded systems. Since virtual prototypes are typically constructed manually, their value during design space exploration is limited. On the other hand, system synthesis approaches often start from abstract and executable models, allowing for fast design space exploration, considering only predefined design decisions. Usually, the output of these approaches is an ad hoc implementation, which is hard to reuse in further refinement steps. In this paper, we propose a methodology for automatic generation of heterogeneous MPSoC virtual prototypes starting with models for streaming applications. The advantage of the proposed approach lies in the fact that it is open to subsequent design steps. The applicability of the proposed approach to real-world applications is demonstrated using a Motion JPEG decoder application that is automatically refined into several virtual prototypes within seconds, which are correct by construction, instead of using error-prone manual refinement, which typically requires several days. © 2011 IEEE.}, author = {Kutzer, Philip and Gladigau, Jens and Haubelt, Christian and Teich, Jürgen}, booktitle = {Proceedings of the 22nd IEEE International Symposium on Rapid System Prototyping}, doi = {10.1109/RSP.2011.5929986}, faupublication = {yes}, isbn = {9781457706585}, pages = {128-134}, peerreviewed = {unknown}, title = {{Automatic} generation of system-level virtual prototypes from streaming application models}, venue = {Karlsruhe}, year = {2011} } @inproceedings{faucris.118770124, abstract = {This paper presents a graph-based representation of success trees to evaluate the reliability of an embedded system. First, a success tree is constructed by deriving a characteristic function from a graph-based system model automatically. The constructed success tree is then translated to a graph (called a success graph) which supports both cyclic and acyclic data dependencies in applications to be mapped to the system resources and analyzed for reliability. To analyze the success graph, an algorithm called 0-propagation is introduced which propagates errors through the graph. The system fails if the errors propagate to the output of the success graph. Experimental results show that the proposed technique can simply and efficiently construct and analyze success trees of real-life embedded systems in a short time with negligible inaccuracy which suits well for evaluating the reliability of complex systems, e. g., as part of a design space exploration. © 2014 IEEE.}, author = {Aliee, Hananeh and Glaß, Michael and Wanka, Rolf and Teich, Jürgen}, booktitle = {Proc. 60th Annual Reliability and Maintainability Symposium (RAMS)}, date = {2014-01-27/2014-01-30}, doi = {10.1109/RAMS.2014.6798487}, faupublication = {yes}, keywords = {design space exploration; fault tree; permanent fault; stochastic logic; Success tree; transient fault}, note = {UnivIS-Import:2015-04-16:Pub.2014.tech.IMMD.infalg.{\_}autom}, pages = {563-569}, title = {{Automatic} {Graph}-based {Success} {Tree} {Construction} and {Analysis}}, url = {https://www12.informatik.uni-erlangen.de/people/rwanka/publications/AGWT14.php}, venue = {Colorado Springs, Colorado, USA}, year = {2014} } @inproceedings{faucris.106810044, abstract = {

Programming image processing algorithms on hardware accelerators such as graphics processing units (GPUs) often exhibits a trade-off between software portability and performance portability. Domain-specific languages (DSLs) have proven to be a promising remedy, which enable optimizations and generation of efficient code from a concise, high-level algorithm representation. The scope of this paper is an optimization framework for image processing DSLs in the form of a source-to-source compiler. To cope with the inter-kernel communication bound via global memory for GPU applications, kernel fusion is investigated as a primary optimization technique to improve temporal locality. In order to enable automatic kernel fusion, we analyze the fusibility of each kernel in the algorithm, in terms of data dependencies, resource utilization, and parallelism granularity. By combining the obtained information with the domain-specific knowledge captured in the DSL, a method to automatically fuse the suitable kernels is proposed and integrated into an open source DSL framework. The novel kernel fusion technique is evaluated on two filter-based image processing applications, for which speedups of up to 1.60 are obtained for an NVIDIA Geforce 745 graphics card target.

}, author = {Qiao, Bo and Reiche, Oliver and Hannig, Frank and Teich, Jürgen}, booktitle = {Proceedings of the 21th International Workshop on Software and Compilers for Embedded Systems (SCOPES)}, date = {2018-05-28/2018-05-30}, doi = {10.1145/3207719.3207723}, faupublication = {yes}, isbn = {978-1-4503-5780-7}, keywords = {Domain-specific Languages; Image Processing; Kernel Fusion}, pages = {76-85}, peerreviewed = {Yes}, title = {{Automatic} {Kernel} {Fusion} for {Image} {Processing} {DSLs}}, venue = {Sankt Goar}, year = {2018} } @inproceedings{faucris.124190484, author = {Pourmohseni, Behnaz and Glaß, Michael and Teich, Jürgen}, booktitle = {Proceedings of the 20th Design, Automation & Test in Europe Conference & Exhibition (DATE)}, date = {2017-03-27/2017-03-31}, doi = {10.23919/DATE.2017.7927160}, faupublication = {yes}, keywords = {Operating point distillation, hybrid application mapping, design space exploration, run-time management, heterogeneous many-core systems}, pages = {1135-1140}, peerreviewed = {Yes}, title = {{Automatic} {Operating} {Point} {Distillation} for {Hybrid} {Mapping} {Methodologies}}, venue = {Lausanne}, year = {2017} } @inproceedings{faucris.118783104, abstract = {In the domain of image processing, often real-time constraints are required. In particular, in safety-critical applications, such as X-ray computed tomography in medical imaging or advanced driver assistance systems in the automotive domain, timing is of utmost importance. A common approach to maintain real-time capabilities of compute-intensive applications is to offload those computations to dedicated accelerator hardware, such as Field Programmable Gate Arrays (FPGAs). Programming such architectures is a challenging task, with respect to the typical FPGA-specific design criteria: Achievable overall algorithm latency and resource usage of FPGA primitives (BRAM, FF, LUT, and DSP). High-Level Synthesis (HLS) dramatically simplifies this task by enabling the description of algorithms in well-known higher languages (C/C++) and its automatic synthesis that can be accomplished by HLS tools. However, algorithm developers still need expert knowledge about the target architecture, in order to achieve satisfying results. Therefore, in previous work, we have shown that elevating the description of image algorithms to an even higher abstraction level, by using a Domain-Specific Language (DSL), can significantly cut down the complexity for designing such algorithms for FPGAs. To give the developer even more control over the common trade-off, latency vs. resource usage, we will present an automatic optimization process where these criteria are analyzed and fed back to the DSL compiler, in order to generate code that is closer to the desired design specifications. Finally, we generate code for stereo block matching algorithms and compare it with hand}, author = {Reiche, Oliver and Häublein, Konrad and Reichenbach, Marc and Hannig, Frank and Teich, Jürgen and Fey, Dietmar}, booktitle = {Proceedings of the DATE Friday Workshop on Heterogeneous Architectures and Design Methods for Embedded Image Systems (HIS 2015)}, date = {2015-03-13/2015-03-13}, faupublication = {yes}, keywords = {GRK-1773}, note = {UnivIS-Import:2015-04-17:Pub.2015.tech.IMMD.IMMD3.{\_}autom}, pages = {10-15}, peerreviewed = {Yes}, title = {{Automatic} {Optimization} of {Hardware} {Accelerators} for {Image} {Processing}}, url = {http://arxiv.org/abs/1502.07448}, venue = {Grenoble}, year = {2015} } @inproceedings{faucris.118570584, abstract = {An efficient memory bandwidth utilization for GPU accelerators is crucial for memory bound applications. In medical imaging, the performance of many kernels is limited by the available memory bandwidth since only a few operations are performed per pixel. For such kernels only a fraction of the compute power provided by GPU accelerators can be exploited and performance is predetermined by memory bandwidth. As a remedy, this paper investigates the optimal utilization of available memory bandwidth by means of increasing in-flight memory transactions. Instead of doing this manually for different GPU accelerators, the required CUDA and OpenCL code is automatically generated from descriptions in a Domain-Specific Language (DSL) for the considered application domain. Moreover, the DSL is extended to also support global reduction operators. We show that the generated target-specific code improves bandwidth utilization for memory-bound kernels significantly. Moreover, competitive performance compared to the GPU back end of the widely used image processing library OpenCV can be achieved. © 2012 IEEE.}, address = {New York, NY, USA}, author = {Membarth, Richard and Hannig, Frank and Teich, Jürgen and Körner, Mario and Eckert, Wieland}, booktitle = {Proc. of the 11th International Symposium on Parallel and Distributed Computing (ISPDC)}, date = {2012-06-25/2012-06-29}, doi = {10.1109/ISPDC.2012.36}, faupublication = {yes}, isbn = {978-1-4673-2599-8}, keywords = {code generation; CUDA; domain-specific language; global operators; GPU; medical imaging; memory bandwidth utilization; OpenCL; reductions}, note = {UnivIS-Import:2015-04-16:Pub.2012.tech.IMMD.inform.automa}, pages = {211-218}, publisher = {IEEE Press}, title = {{Automatic} {Optimization} of {In}-{Flight} {Memory} {Transactions} for {GPU} {Accelerators} based on a {Domain}-{Specific} {Language} for {Medical} {Imaging}}, venue = {Munich}, year = {2012} } @inproceedings{faucris.120088364, abstract = {To cope with the strict reliability requirements of safety-critical
ADAS applications, the upcoming TSN standard introduces mechanisms
that enable transmission redundancy at any switch or end
node. However, it is up to the designer to decide at which points
and for which messages to activate transmission redundancy. This
significantly increases the design space and requires to trade-off
reliability with other routing-related design objectives like network
load, transmission timing, or the monetary cost of the hardware. As
a remedy, this paper a) presents two different exact approaches to
generate feasible redundant message routings and b) proposes an
extension of the state-of-the-art approach for the multi-objective
routing optimization, enabling the optimizer to directly adjust system
features that are relevant for the design objectives. A case study
with an application from the automotive domain compares the optimization
capabilities of the presented approaches for the routing
generation and demonstrates the significant gain in optimization
power that is achieved with the proposed optimization extension.}, author = {Smirnov, Fedor and Reimann, Felix and Teich, Jürgen and Han, Zhao and Glaß, Michael}, booktitle = {Proceedings of 21st International Workshop on Software and Compilers for Embedded Systems (SCOPES 2018)}, date = {2018-05-28/2018-05-30}, doi = {10.1145/3207719.3207725}, editor = {ACM}, faupublication = {yes}, keywords = {Design Space Exploration; Network Optimization; Automotive Ethernet}, peerreviewed = {unknown}, title = {{Automatic} {Optimization} of {Redundant} {Message} {Routings} in {Automotive} {Networks}}, venue = {Sankt Goar}, year = {2018} } @article{faucris.203900661, abstract = {Dividing the communication network into so-called Virtual Local Area Networks (VLANs), i.e., subnetworks that are isolated at the data link layer (OSI layer 2), is a promising approach to address the increasing security challenges in automotive networks. The automation of the VLAN partitioning is a well-researched problem in the domain of local or metropolitan area networks. However, the approaches used there are hardly applicable for the design of automotive networks as they mainly focus on reducing the amount of broadcast traffic and cannot capture the many design objectives of automotive networks like the message timing or the link load, which are affected by the VLAN partitioning. As a remedy, this article proposes an approach based on a set of Pseudo-Boolean constraints to generate a message routing which is feasible with respect to the VLAN-related routing restrictions in automotive networks. This approach can be used for a design space exploration to optimize not only the VLAN partitioning but also other routing-related objectives. We demonstrate both the efficiency of our message routing approach and the now accessible optimization potential for the complete Electric/Electronic architecture with a mixed-criticality system from the automotive domain. There we thoroughly investigate the impact of the VLAN partitioning on the message timing and the link loads by optimizing these design objectives concurrently. During the exploration of the huge design space, where each resource can be assigned to one of 4 VLANs, our approach requires less than 40 milliseconds for the creation of a valid solution and ensures that all messages satisfy their deadlines and link load bounds.Embedded system applications often require guarantees regarding non-functional properties when executed on a given MPSoC platform. Examples of such requirements include real-time, energy or safety properties on corresponding programs. One option to implement the enforcement of such requirements is by a reactive control loop, where an enforcer decides based on a system response (feedback) how to control the system, e.g., by adapting the number of cores allocated to a program or by scaling the voltage/frequency mode of involved processors. Typically, a violation of a requirement must either never happen in case of strict enforcement, or only happen temporally (in case of so-called loose enforcement). However, it is a challenge to design enforcers for which it is possible to give formal guarantees with respect to requirements, especially in the presence of typically largely varying environmental input (workload) per execution. Technically, an enforcement strategy can be formally modeled by a finite state machine (FSM) and the uncertain environment determining the workload by a discrete-time Markov chain. It has been shown in previous work that this formalization allows the formal verification of temporal properties (verification goals) regarding the fulfillment of requirements for a given enforcement strategy. In this paper, we consider the so far unsolved problem of design space exploration and automatic synthesis of enforcement automata that maximize a number of deterministic and probabilistic verification goals formulated on a given set of non-functional requirements. For the design space exploration (DSE), an approach based on multi-objective evolutionary algorithms is proposed in which enforcement automata are encoded as genes of states and state transition conditions. For each individual, the verification goals are evaluated using probabilistic model checking. At the end, the DSE returns a set of efficient FSMs in terms of probabilities of meeting given requirements. As experimental results, we present three use cases while considering requirements on latency and energy consumption.}, author = {Esper, Khalil and Wildermann, Stefan and Teich, Jürgen}, doi = {10.1145/3617832}, faupublication = {yes}, journal = {ACM Transactions on Design Automation of Electronic Systems}, keywords = {Finite State Machine; Genetic Algorithm; Probabilistic Model Cheking; Design Space Exploration; Verification; Runtime Requirement Enforcement; Optimization; Enforcement FSM; MPSoC; Steady State; Evolutionary Algorithm; PCTL; Markov Chain;}, pages = {1-20}, peerreviewed = {Yes}, title = {{Automatic} {Synthesis} of {FSMs} for {Enforcing} {Non}-{Functional} {Requirements} on {MPSoCs} {Using} {Multi}-{Objective} {Evolutionary} {Algorithms}}, volume = {28}, year = {2023} } @article{faucris.107703464, author = {Gladigau, Jens and Gerstlauer, Andreas and Haubelt, Christian and Streubühr, Martin and Teich, Jürgen}, faupublication = {yes}, journal = {LNCS Transactions on High-Performance Embedded Architectures and Compilers}, pages = {1-22}, peerreviewed = {Yes}, title = {{Automatic} {System}-{Level} {Synthesis}: {From} {Formal} {Application} {Models} to {Generic} {Bus}-{Based} {MPSoCs}}, volume = {5}, year = {2011} } @article{faucris.201476911, abstract = {The purpose of a domain-specific language (DSL) is to enable the application programmer to specify a problem, or an abstract algorithm description, in his/her domain of expertise without being burdened by implementation details. The ideal scenario is that the implementation detail is added in an automatic process of program translation and code generation. The approach of domain-specific program generation has lately received increasing attention in the area of computational science and engineering. We introduce the new code generation framework Athariac. Its goal is to support the quick implementation of a language processing and program optimization platform for a given DSL based on stepwise term-rewriting. We demonstrate the framework's use on our DSL ExaSlang for the specification and optimization of multigrid solvers. On this example, we provide evidence of Athariac's potential for making domain-specific software engineering more productiv}, author = {Schmitt, Christian and Kronawitter, Stefan and Hannig, Frank and Teich, Jürgen and Lengauer, Christian}, doi = {10.1109/JPROC.2018.2854229}, faupublication = {yes}, journal = {Proceedings of the IEEE}, keywords = {ExaStencils; ExaSlang; DSL; Domain-Specific Languages; Code Generation; Program Transformation; Compiler}, pages = {1969-1984}, peerreviewed = {Yes}, title = {{Automating} the {Development} of {High}-{Performance} {Multigrid} {Solvers}}, volume = {106}, year = {2018} } @inproceedings{faucris.123961464, author = {Dinkel, Thomas and Haubelt, Christian and Heinkel, Ulrich and Schlichter, Thomas and Teich, Jürgen}, booktitle = {Methoden und Beschreibungssprachen zur Modellierung und Verifikation von Schaltungen und Systemen}, date = {2005-04-06/2005-04-07}, faupublication = {yes}, peerreviewed = {unknown}, title = {{Automatische} {Verification} von {ADeVA}-{Spezifikationen}}, venue = {Munich}, year = {2005} } @inproceedings{faucris.117898484, author = {Stechele, Walter and Bringmann, Oliver and Ernst, Rolf and Herkersdorf, Andreas and Hojenski, Katharina and Janacik, Peter and Rammig, Franz and Teich, Jürgen and Wehn, Norbert and Zeppenfeld, Johannes and Ziener, Daniel}, booktitle = {Proceedings of Zuverlässigkeit und Entwurf (ZuD 2007)}, faupublication = {yes}, note = {UnivIS-Import:2015-04-16:Pub.2007.tech.IMMD.inform.autono}, pages = {137-138}, title = {{Autonomic} {MPSoCs} for {Reliable} {Systems}}, venue = {Munich}, year = {2007} } @inproceedings{faucris.276825729, abstract = {

Many Big Data applications include the processing of data streams on semi-structured data formats such as JSON.

A disadvantage of these formats, however, is that applications may require a significant portion of their processing time to unselectively parse all data.

As a remedy, so-called raw filters have been introduced in the past, aiming to reduce the data load before the costly parsing stage.

Since filtering unparsed data can also become very costly, raw filters can be designed to filter data approximately, in the sense that they allow false positives to occur, in order to be implemented efficiently.

While previously proposed CPU-based solutions are restricted to just string filtering, FPGA approaches have recently been proposed with much more expressive raw filters, allowing also to capture numbers and structural relationships.

Yet, as a consequence of the variety of filter possibilities as well as the limited amount of resources available on FPGAs, the selection of optimal filters before their deployment has been identified as a complex problem resulting in the potential need to select less expressive filters in order to consume fewer resources.

Many Big Data applications (e.g., stream processing) operate on incoming real-time data over long, potentially unlimited time periods.

As a consequence, the conditions for which such a filter is optimized can change over time after its deployment.

In this realm, this paper presents a new methodology which automatically adapts the hardware accelerator for raw filtering by means of dynamic hardware reconfiguration.

Data is sampled on-the-fly during operation and used by an optimizer-in-the-loop to select and generate a raw filter with optimized selectivity for these data samples.

As the optimizer has to take into account the resource costs of the hardware accelerator, we introduce models to estimate the resource costs in order to avoid performing a full synthesis.

The filter selection problem can thus be solved within a few minutes with results close to the accurate resource cost estimation.

If the selectivity of a query changes over time, such as seasonal differences in the analysis of IoT data, the system can auto-tune its filter to adapt to the situation.

Depending on the query and the variability of inherent data changes, significant improvements in the amount of filtered data are presented, resulting in a significant parsing speedup in comparison to a state-of-the-art non-adaptive approach.

}, author = {Hahn, Tobias and Wildermann, Stefan and Teich, Jürgen}, booktitle = {IEEE Proceedings of the 32nd International Conference on Field Programmable Logic and Applications}, date = {2022-08-29/2022-09-02}, doi = {10.1109/FPL57034.2022.00036}, faupublication = {yes}, keywords = {Raw Filtering; Auto-Tuning; Self-Evolution; JSON; FPGA; HW/SW-Co-Design}, peerreviewed = {unknown}, title = {{Auto}-{Tuning} of {Raw} {Filters} for {FPGAs}}, venue = {Belfast, United Kingdom}, year = {2022} } @inproceedings{faucris.124202364, abstract = {

The parallelization of programs and distributing their workloads to multiple threads can be a challenging task. In addition to multi-threading, harnessing vector units in CPUs proves highly desirable. However, employing vector units to speed up programs can be quite tedious. Either a program developer solely relies on the auto-vectorization capabilities of the compiler or he manually applies vector intrinsics, which is extremely error-prone, difficult to maintain, and not portable at all.

Based on whole-function vectorization, a method to replace control flow with data flow, we propose auto-vectorization techniques for image processing DSLs in the context of source-to-source compilation. The approach does not require the input to be available in SSA form. Moreover, we formulate constraints under which the vectorization analysis and code transformations may be greatly simplified in the context of image processing DSLs. As part of our methodology, we present control flow to data flow transformation as a source-to-source translation. Moreover, we propose a method to efficiently analyze algorithms with mixed bit-width data types to determine the optimal SIMD width, independently of the target instruction set. The techniques are integrated into an open source DSL framework. Subsequently, the

vectorization capabilities are compared to a variety of existing state-of-the-art C/C++ compilers. A geometric mean speedup of up to 3.14 is observed for benchmarks taken from ISPC and image processing, compared to non-vectorized executions.

}, author = {Reiche, Oliver and Kobylko, C. and Hannig, Frank and Teich, Jürgen}, booktitle = {Proceedings of the 18th International Conference on Languages, Compilers, Tools, and Theory for Embedded Systems (LCTES)}, date = {2017-06-21/2017-06-22}, doi = {10.1145/3078633.3081039}, faupublication = {yes}, keywords = {Domain-Specific Languages, Vectorization, Image Processing}, pages = {21 - 30}, peerreviewed = {Yes}, publisher = {ACM}, title = {{Auto}-vectorization for {Image} {Processing} {DSLs}}, venue = {Barcelona}, year = {2017} } @inproceedings{faucris.119369404, abstract = {In this paper, we propose a novel system-level power modeling methodology that allows for very fast joint power-performance evaluation at specification phase. This methodology adopts approximately-timed task-accurate performance models and augments them with power-state-based power models to enable efficient simulation. A flexible method is also proposed to model complex dynamic power management policies so that their effects can be evaluated. We validate the accuracy of our methodology by comparing simulation results with measurements on a real mobile phone platform. Experimental results show that the simulated power profile matches very well with the measurements and it only takes about 100 ms to simulate a 20 ms GSM paging burst use case. © 2012 Springer-Verlag.}, author = {Xu, Yang and Rosales, Rafael and Wang, Bo and Streubühr, Martin and Hasholzner, Ralph and Haubelt, Christian and Teich, Jürgen}, booktitle = {Proceedings of the International Conference on Architecture of Computing Systems (ARCS)}, doi = {10.1007/978-3-642-28293-5{\_}4}, faupublication = {yes}, isbn = {9783642282928}, pages = {37-49}, peerreviewed = {unknown}, title = {{A} very fast and quasi-accurate power-state-based system-level power modeling methodology}, venue = {Munich}, year = {2012} } @inproceedings{faucris.119665744, author = {Fickenscher, Jörg and Hannig, Frank and Teich, Jürgen and Bouzouraa, Mohammed Essayed}, booktitle = {4th International Conference on Vehicle Technology and Intelligent Transport Systems (VEHITS)}, date = {2018-03-16/2018-03-18}, doi = {10.5220/0006677302980306}, faupublication = {yes}, isbn = {978-989-758-293-6}, pages = {298-306}, peerreviewed = {Yes}, publisher = {SCITEPRESS}, title = {{Base} {Algorithms} of {Environment} {Maps} and {Efficient} {Occupancy} {Grid} {Mapping} on {Embedded} {GPUs}}, venue = {Funchal, Madeira, Portugal}, year = {2018} } @inproceedings{faucris.122103784, author = {Haubelt, Christian and Koch, Dirk and Teich, Jürgen}, booktitle = {Proceedings of the Third International Workshop on Systems, Architectures, Modeling and Simulation (SAMOS'03)}, date = {2003-07-21/2003-07-23}, faupublication = {yes}, pages = {18-22}, peerreviewed = {unknown}, title = {{Basic} {OS} {Support} for {Distributed} {Reconfigurable} {Hardware}}, venue = {Samos}, year = {2003} } @inproceedings{faucris.109510324, abstract = {While recent research is mainly focused on the OS support for a single reconfigurable node, this paper presents a general approach to manage distributed reconfigurable hardware. The most outstanding properties of these systems are the ability of reconfiguration, hardware task migration, and fault tolerance. This paper presents first ideas of an operating system (OS) for such architectures. Furthermore, a prototype implementation consisting of four fully connected FPGAs will be presented. © Springer-Verlag Berlin Heidelberg 2004.}, author = {Haubelt, Christian and Koch, Dirk and Teich, Jürgen}, booktitle = {Proceedings of the International Workshop on Systems, Architectures, Modeling and Simulation (SAMOS'04)}, date = {2004-07-19/2004-07-21}, faupublication = {yes}, pages = {30-38}, peerreviewed = {unknown}, title = {{Basic} {OS} support for distributed reconfigurable hardware}, url = {https://www.scopus.com/inward/record.url?partnerID=HzOxMe3b&scp=35048878778&origin=inward}, venue = {Samos}, volume = {3133}, year = {2004} } @incollection{faucris.281549226, author = {Teich, Jürgen and Esper, Khalil and Falk, Joachim and Pourmohseni, Behnaz and Schwarzer, Tobias and Wildermann, Stefan}, booktitle = {Invasive Computing}, doi = {10.25593/978-3-96147-571-1}, editor = {Jürgen Teich, Jörg Henkel, Andreas Herkersdorf}, faupublication = {yes}, isbn = {978-3-96147-571-1}, pages = {69-95}, peerreviewed = {No}, publisher = {FAU University Press}, title = {{Basics} of {Invasive} {Computing}}, year = {2022} } @inproceedings{faucris.118484784, abstract = {Sorting is one of the most investigated tasks computers are used for. Up to now, not much research has been put into increasing the flexibility and performance of sorting applications by applying reconfigurable computer systems. There are parallel sorting algorithms (sorting circuits) which are highly suitable for VLSI hardware realization and which outperform sequential sorting methods applied on traditional software processors by far. But usually they require a large area that increases with the number of keys to be sorted. This drawback concerns ASIC and statically reconfigurable systems. In this paper, we present a way to adopt the well-known Bitonic sorting method to dynamically reconfigurable systems such that this drawback is overcome. We present a detailed description of the design and actual implementation, and we present experimental results of our approach to show its benefits in performance and the trade-offs of our approach. © 2011 IEEE.}, address = {New York, NY, USA}, author = {Angermeier, Josef and Sibirko, Eugen and Wanka, Rolf and Teich, Jürgen}, booktitle = {Proc. IEEE International Symposium on Parallel and Distributed Processing Workshops and Phd Forum (IPDPSW)}, date = {2011-05-16/2011-05-20}, doi = {10.1109/IPDPS.2011.164}, faupublication = {yes}, isbn = {978-1-61284-425-1}, note = {UnivIS-Import:2015-04-16:Pub.2011.tech.IMMD.infalg.{\_}biton}, pages = {314-317}, publisher = {IEEE Press}, title = {{Bitonic} {Sorting} on {Dynamically} {Reconfigurable} {Architectures}}, venue = {Anchorage, AL}, year = {2011} } @inproceedings{faucris.117899584, abstract = {In this paper, we present hardware decompression accelerators for bridging the gap between high speed FPGA configuration interfaces and slow configuration memories. We discuss different compression algorithms suitable for a decompression on FPGAs as well as on CPLDs with respect to the achievable compression ratio, throughput, and hardware overhead. This leads to various decompressor implementations with one capable to decompress at high data rates of up to 400 megabytes per second while only requiring slightly more than a hundred look-up tables. Furthermore, we present a sophisticated configuration bitstream benchmark. © 2007 IEEE.}, address = {New York}, author = {Koch, Dirk and Beckhoff, Christian and Teich, Jürgen}, booktitle = {Proc. of the IEEE International Conference on Field-Programmable Technology 2007}, date = {2007-12-12/2007-12-14}, doi = {10.1109/FPT.2007.4439245}, faupublication = {yes}, note = {UnivIS-Import:2015-04-16:Pub.2007.tech.IMMD.inform.bitstr}, pages = {161-168}, publisher = {IEEE Press}, title = {{Bitstream} {Decompression} for {High} {Speed} {FPGA} {Configuration} from {Slow} {Memories}}, venue = {Kokurakita, Kitakyushu}, year = {2007} } @inproceedings{faucris.122649164, abstract = {We present a new methodology for controlling the space-time behavior of VLSI and FPGA-based processor arrays. The main idea is to generate simple local control elements which take control over the activeness of each attached processor element. Each control element thereby propagates a "start" and a "stop execution" signal to its neighbors. We show that our control mechanism is much more efficient than existing approaches such as [10, 17] because 1) only two control signals (start/stop) are required, 2) no extension of the computation space is necessary. 3) By the local propagation of just one start/stop signal, energy is saved as processing elements are only active between the time they have received the start signal and the time they have received the stop signal. Our methodology is applicable to one- and multi-dimensional processor arrays and is based on local control signal propagation. We provide a theoretical analysis of the overhead caused by the control structure.}, author = {Bednara, Marcus and Hannig, Frank and Teich, Jürgen}, booktitle = {Proc. 35th IEEE Asilomar Conf. on Signals, Systems and Computers}, editor = {Matthews M.B.}, faupublication = {no}, keywords = {Control; FPGA; Hardware mapping; Processor arrays}, pages = {468-474}, peerreviewed = {unknown}, title = {{Boundary} control: {A} new distributed control architecture for space-time transformed ({VLSI}) processor arrays}, url = {https://www.scopus.com/inward/record.url?partnerID=HzOxMe3b&scp=0035573058&origin=inward}, venue = {Pacific Grove, CA}, volume = {1}, year = {2001} } @inproceedings{faucris.118704784, abstract = {Matlab/Simulink is today's de-facto standard for model-based design in domains such as control engineering and signal processing. Particular strengths of Simulink are rapid design and algorithm exploration. Moreover, commercial tools are available to generate embedded C or HDL code directly from a Simulink model. On the other hand, Simulink models are purely functional models and, hence, designers cannot seamlessly consider the architecture that a Simulink model is later implemented on. In particular, it is not possible to explore the different architectural alternatives and investigate the arising interactions and side-effects directly within Simulink. To benefit from Matlab/Simulink's algorithm exploration capabilities and overcome the outlined drawbacks, this work introduces a model transformation framework that converts a Simulink model to an executable specification, written in an actor-oriented modeling language. This specification then serves as the input of well-established Electronic System Level (ESL) design flows that, e. g., enables Design Space Exploration (DSE) and automatic code generation for both hardware and software. We also present a validation technique that considers the functional correctness by comparing the original Simulink model with the generated specification in a co-simulation environment. The co-simulation can also be used to evaluate the performance of implementation candidates during DSE. As case study, we present and investigate a torque vectoring application from an electric automotive vehicle. © 2013 European Electronic Chips & Systems design Initiative - ECSI.}, address = {New York, NY, USA}, author = {Zhang, Liyuan and Glaß, Michael and Ballmann, Nils and Teich, Jürgen}, booktitle = {Proc. Forum on Specification & Design Languages}, date = {2013-09-24/2013-09-26}, faupublication = {yes}, isbn = {978-2-9530504-8-6}, keywords = {Actor-oriented Design; Code Generation; DSE; Matlab/Simulink; System-level Validation; SystemC}, note = {UnivIS-Import:2015-04-16:Pub.2013.tech.IMMD.inform.bridgi}, pages = {1-8}, publisher = {IEEE Press}, title = {{Bridging} {Algorithm} and {ESL} {Design}: {Matlab}/{Simulink} {Model} {Transformation} and {Validation}}, venue = {Paris}, year = {2013} } @incollection{faucris.122918444, author = {Zhang, Liyuan and Glaß, Michael and Ballmann, Nils and Teich, Jürgen}, booktitle = {Languages, Design Methods, and Tools for Electronic System Design}, doi = {10.1007/978-3-319-06317-1{\_}10}, editor = {Marie-Minerve Louërat, Torsten Maehne}, faupublication = {yes}, isbn = {978-3-319-06316-4}, pages = {189-206}, peerreviewed = {unknown}, publisher = {Springer}, title = {{Bridging} {Algorithm} and {ESL} {Design}: {MATLAB}/{Simulink} {Model} {Transformation} and {Validation}}, year = {2015} } @inproceedings{faucris.117710164, address = {Dagstuhl, Germany}, author = {Göhringer, Diana and Majer, Mateusz and Teich, Jürgen}, booktitle = {Proceedings of the Dagstuhl Seminar Nº 06141 on Dynamically Reconfigurable Architectures}, faupublication = {yes}, note = {UnivIS-Import:2015-04-16:Pub.2006.tech.IMMD.inform.bridgi}, pages = {7 - 18}, publisher = {Internationales Begegnungs- und Forschungszentrum fuer Informatik (IBFI), Schloss Dagstuhl, Germany}, series = {Dagstuhl Seminar Proceedings}, title = {{Bridging} the {Gap} between {Relocation} and {Available} {Technology}: {The} {Erlangen} {Slot} {Machine}}, url = {http://drops.dagstuhl.de/opus/volltexte/2006/736}, venue = {Dagstuhl}, year = {2006} } @incollection{faucris.121831864, abstract = {In the context of digital signal processing, synchronous data flow (SDF) graphs [12] are widely used for specification. For these, so called single appearance schedules provide program memory-optimal uniprocessor implementations. Here, buffer memory minimized schedules are explored among these using an Evolutionary Algorithm (EA). Whereas for a restricted class of graphs, there exist optimal polynomial algorithms, these are not exact and may provide poor results when applied to arbitrary, i.e., randomly generated graphs. We show that a careful EA implementation may outperform these algorithms by sometimes orders of magnitude.}, address = {Amsterdam, The Netherlands}, author = {Bhattacharyya, Shuvra S. and Teich, Jürgen and Zitzler, Eckart}, booktitle = {Parallel Problem Solving from Nature (PPSN'98)}, faupublication = {no}, isbn = {3540650784}, note = {UnivIS-Import:2015-04-20:Pub.1998.tech.IMMD.inform.buffer}, pages = {292-301}, peerreviewed = {unknown}, publisher = {Springer-verlag}, title = {{Buffer} {Memory} {Optimization} in {DSP} {Applications} - {An} {Evolutionary} {Approach}}, year = {1998} } @inproceedings{faucris.121759264, author = {Bhattacharyya, Shuvra and Teich, Jürgen and Zitzler, Eckart}, booktitle = {Springer Lecture Notes in Computer Science (LNCS) 1498}, faupublication = {no}, pages = {292-301}, peerreviewed = {unknown}, publisher = {Springer-Verlag}, title = {{Buffer} {Memory} {Optimization} in {DSP} {Applications} - {An} {Evolutionary} {Approach}}, venue = {Amsterdam}, year = {1998} } @article{faucris.123726064, abstract = {With the term Architecture/Compiler Co-exploration, we denote the problem of simultaneously optimizing an application-specific instruction set processor (ASIP) architecture as well as its generated compiler. In this paper, we characterize the design space of both compiler frontend (intermediate code optimization) and backend (changes of the machine model) and present the workflow of our framework BUILDABONG. The project consists of four phases: (a) architecture entry and composition, (b) automatic simulator generation, (c) compiler generation (in particular, retargeting), and (d) automatic architecture/compiler design space exploration. We demonstrate the feasibility of our approach by a detailed case study.}, author = {Fischer, Dirk and Teich, Jürgen and Thies, M. and Weper, R.}, doi = {10.1142/S0218126603000799}, faupublication = {no}, journal = {Journal of Circuits Systems and Computers}, keywords = {Architecture/compiler co-design; ASIP; Multiobjective design space exploration; Retargetable compilation}, pages = {353-375}, peerreviewed = {Yes}, title = {{Buildabong}: {A} framework for architecture/compiler co-exploration for {ASIPs}}, volume = {12}, year = {2003} } @inproceedings{faucris.118417904, author = {Fischer, Dirk and Teich, Jürgen and Trinkert, Stefan and Weper, Ralph}, booktitle = {Proc. DSP-Deutschland 2000}, faupublication = {no}, note = {UnivIS-Import:2015-04-16:Pub.2000.tech.IMMD.inform.builda}, pages = {153-162}, title = {{BUILDABONG}: {A} {Rapid} {Prototyping} {Environment} for {ASIPs}}, venue = {Munich}, year = {2000} } @inproceedings{faucris.123122824, abstract = {In recent years, road vehicles have seen a tremendous increase on driver assistance systems like lane departure warning, traffic sign recognition, or pedestrian detection. The development of efficient and cost-effective electronic control units that meet the necessary real-time performance for these systems is a complex challenge. Often, Electronic System-Level design tackles the challenge by simulation-based performance evaluation, although, the quality of system-level performance simulation approaches is not yet evaluated in detail. In this paper, we present the calibration and validation of a system-level performance simulation model. For evaluation, an automotive pedestrian detection algorithm is studied. Especially the varying number of pedestrians has a significant impact to the system performance and makes the prediction of execution time difficult. As test cases we used typical sequences and corner cases recorded by an experimental car. Our evaluation results indicate that prediction of execution times with an average error of 3.1% and a maximum error of 7.9% can be achieved. Thereby, simulated and measured execution times of a software implementation are compared. © 2011 IEEE.}, author = {Kiesel, Rainer and Streubühr, Martin and Haubelt, Christian and Löhlein, Otto and Teich, Jürgen}, booktitle = {Proceedings of the International Conference on Embedded Computer Systems: Architectures, Modeling and Simulation (SAMOS XI)}, doi = {10.1109/SAMOS.2011.6045460}, faupublication = {yes}, isbn = {9781457708008}, pages = {182-189}, title = {{Calibration} and validation of software performance models for pedestrian detection systems}, venue = {Samos}, year = {2011} } @inproceedings{faucris.118281284, abstract = {As the number of electronic components in automobiles steadily increases, the demand for higher communication bandwidth also rises dramatically. Instead of installing new wiring harnesses and new bus structures, it would be useful, if already available structures could be used, but driven at higher data rates. In this paper, we a) propose an extension of the well-known Controller Area Network (CAN) called CAN+ with which the target rate of 1Mbit/s can be increased up to 16 times. Moreover, b) existing CAN hardware and devices not dedicated to these boosted data rates can still be used without interferences on communication. The major idea is a change of the protocol. In particular, we exploit the fact that data could be sent in time slots, where CAN-conform nodes don't listen. Finally, c) an implementation of this type of overclocking scheme on an FPGA is provided to prove the feasibility and the impressive through-put gains. © 2009 EDAA.}, author = {Ziermann, Tobias and Wildermann, Stefan and Teich, Jürgen}, booktitle = {Proc. Design, Automation and Test in Europe}, date = {2009-04-20/2009-04-24}, faupublication = {yes}, note = {UnivIS-Import:2015-04-16:Pub.2009.tech.IMMD.inform.canane}, pages = {1088-1093}, title = {{CAN}+: {A} {New} {Backward}-compatible {Controller} {Area} {Network} ({CAN}) {Protocol} with up to 16x {Higher} {Data} {Rates}}, venue = {Nice}, year = {2009} } @inproceedings{faucris.203968026, abstract = {Approximate computing allows to tackling conflicting objectives, such as power and accuracy of computations. In this paper, we first describe how knowledge of stimuli's specific features can help in quantifying and improving power savings by means of approximate computing. We investigate FPGA implementations of several approximate circuits and compare their power consumption with non-approximating versions. In particular, we study approximate arithmetics and a clock-gate based technique called memoization. Moreover, we compare the accuracy of estimation techniques for power consumption evaluation versus real measurements under controlled environments. We also experimentally quantify the relationship between switching activity and power consumption. Two important results are concluded from our investigations: (1) Approximate arithmetics do not necessarily consume less power than conventional circuits, whereas memoization techniques can, in fact, reduce power consumption. (2) Simulation-based power evaluation for approximate FPGA implementations can reach fidelity values up to about 89% in input-dependent power characteristics. Yet, to evaluate absolute savings, measurements are require}, author = {Echavarria Gutiérrez, Jorge Alfonso and Schütz, Katja and Becher, Andreas and Wildermann, Stefan and Teich, Jürgen}, booktitle = {Proceedings of IEEE International Conference on Electronics Circuits and Systems}, date = {2018-12-09/2018-12-12}, doi = {10.1109/icecs.2018.8618062}, faupublication = {yes}, keywords = {Switching Activity; Power Consumption; Approximate Computing}, peerreviewed = {Yes}, title = {{Can} {Approximate} {Computing} {Reduce} {Power} {Consumption} on {FPGAs}?}, venue = {Bordeaux}, year = {2018} } @inproceedings{faucris.118175904, author = {Ziermann, Tobias and Teich, Jürgen and Wildermann, Stefan}, booktitle = {Proc. 9th Stuttgart, International Symposium}, faupublication = {yes}, note = {UnivIS-Import:2015-04-16:Pub.2009.tech.IMMD.inform.cantec}, pages = {327-339}, title = {{CAN}+: {Techniques} and {Prototype} for {Achieving} {Increased} {Data} {Rates} on the {Basis} of {Common} {CAN} {Bus} {Structures}}, venue = {Stuttgart}, year = {2009} } @inproceedings{faucris.118771664, abstract = {Networks on Chip (NoC) come along with increased complexity from the implementation and management perspective. This leads to higher energy consumption and programming complexity of NoC architectures. This work introduces communication aware programming to address communication resource management and efficient programming of NoC architectures. A programming interface is introduced to express communication requirements at the language level. These requirements are evaluated by an operating system component, which configures the communication hardware accordingly. The proposed concept enables an intuitive use of NoC features like end-to-end connections and Direct Memory Access (DMA). The presented results show that communication aware programming can improve performance and energy consumption. Copyright 2014 ACM.}, address = {New York, NY, USA}, author = {Heisswolf, Jan and Zaib, Aurang and Zwinkau, Andreas and Kobbe, Sebastian and Weichslgartner, Andreas and Teich, Jürgen and Henkel, Jörg and Snelting, Gregor and Herkersdorf, Andreas and Becker, Jürgen}, booktitle = {Proc. of The 51st Annual Design Automation Conference (DAC)}, date = {2014-06-02/2014-06-05}, doi = {10.1145/2593069.2593103}, faupublication = {yes}, isbn = {978-1-4799-3017-3}, keywords = {Communication; Invasive; Many-core; Network on chip; X10}, note = {UnivIS-Import:2015-04-16:Pub.2014.tech.IMMD.inform.capcom}, pages = {105:1-105:6}, publisher = {ACM Press}, title = {{CAP}: {Communication} {Aware} {Programming}}, venue = {San Francisco, CA}, year = {2014} } @inproceedings{faucris.119391624, author = {Fickenscher, Jörg and Schlumberger, Jens and Hannig, Frank and Bouzouraa, Mohammed Essayed and Teich, Jürgen}, booktitle = {Design, Automation and Test in Europe (DATE)}, date = {2018-03-19/2018-03-23}, doi = {10.23919/DATE.2018.8342050}, editor = {IEEE}, faupublication = {yes}, isbn = {978-3-9819263-1-6}, pages = {443-448}, peerreviewed = {unknown}, publisher = {IEEE}, title = {{Cell}-based {Update} {Algorithm} for {Occupancy} {Grid} {Maps} and new {Hybrid} {Map} for {ADAS} on {Embedded} {GPUs}}, venue = {Dresden, Germany}, year = {2018} } @book{faucris.106505784, address = {New York, U.S.A.,}, editor = {Bhattacharyya, Shuvra S. and Deprettere, Ed and Teich, Jürgen}, faupublication = {yes}, isbn = {0-8247-4711-9}, note = {UnivIS-Import:2015-05-08:Pub.2004.tech.IMMD.inform.energy}, publisher = {Marcel Dekker}, series = {Signal Processing and Communication}, title = {{Chapter} 6 in {Domain}-{Specific} {Processors}: {Systems}, {Architectures}, {Modeling}, and {Simulation}}, year = {2004} } @inproceedings{faucris.271130609, abstract = {FPGAs offer fast and reliable near-data processing and are therefore suitable candidates for implementing IoT and edge computing systems. As they are usually deployed in exposed locations, they are vulnerable to physical attacks, especially Side-Channel Analysis (SCA).
In this paper, we characterize side-channels and how they can be exploited for SCA on FPGA-based off-the-shelf boards, i.e. without having to make any modifications to the board, hardware, or software. The basic requirement for any kind of SCA is that the individual Cryptographic Operations (COs) in the side-channel traces can be detected.
To this end, we apply a SCA for semi-automatic CO detection that can be generically applied off-the-shelf to a wide variety of boards. Additionally, we introduce a new metric called Signal of COs to Noise Ratio (SCONR), that allows to quantify the pronouncedness of COs versus noise in a side channel. We then evaluate side channels measured on three different boards containing Xilinx 7 series FPGAs. We further investigate the influence of other sources of noise and how much they affect the attackability of a system.
Our results show that FPGAs have a high vulnerability to SCA in general and that even noise from an operating system will not hinder the recording and finding of COs in an automated fashion as long as there are no countermeasures in place. Finally, SCONR converges after fewer recorded traces and gives a clearer indication whether a side channel is susceptible to this type of automated attack than leakage assessment techniques such as TVL}, author = {Trautmann, Jens and Teich, Jürgen and Wildermann, Stefan}, booktitle = {30th IEEE International Symposium on Field-Programmable Custom Computing Machines}, date = {2022-05-15/2022-05-18}, doi = {10.1109/FCCM53951.2022.9786190}, faupublication = {yes}, keywords = {Side-Channel Analysis; Off-The-Shelf FPGA Boards; Hardware Security; Security Assessment}, peerreviewed = {Yes}, publisher = {IEEE}, title = {{Characterization} of {Side} {Channels} on {FPGA}-based {Off}-{The}-{Shelf} {Boards} against {Automated} {Attacks}}, venue = {New York City}, year = {2022} } @inproceedings{faucris.258215539, abstract = {FPGA-based Physical Unclonable Functions (PUFs) have emerged as a viable alternative to permanent key storage by turning inaccuracies during the manufacturing process of a chip into a unique, FPGA-intrinsic secret. However, many fixed PUF designs may suffer from unsatisfactory statistical properties in terms of uniqueness, uniformity, and robustness. Moreover, a PUF signature may alter over time due to aging
or changing operating conditions, rendering a PUF insecure in the worst case. As a remedy, we propose CHOICE, a novel class of FPGA-based PUF designs with tunable uniqueness and reliability characteristics. By the use of addressable shift registers available on an FPGA, we show that a wide configuration space
for adjusting a device-specific PUF response is obtained without any sacrifice of randomness. In particular, we demonstrate the concept of address-tunable propagation delays, whereby we are able to increase or decrease the probability of obtaining ’1’s in the PUF response. Experimental evaluations on a group of six 28 nm Xilinx Artix-7 FPGAs show that CHOICE PUFs provide a large range of configurations to allow a fine-tuning to an
average uniqueness between 49% and 51%, while simultaneously achieving bit error rates below 1.5%, thus outperforming state-of-the-art PUF designs. Moreover, with only a single FPGA slice per PUF bit, CHOICE is one of the smallest PUF designs currently available for FPGAs.
latency, and throughput. Our approach is shown to achieve higher speed-ups than similar processor-based approaches while consuming lower amounts of resources.With the abundance of computing devices in our everyday life such as IoT devices, improving their security has become a number one priority. While the major focus lies on software security, hardware vulnerabilities are often not considered. Here, particularly side-channel attacks pose a realistic threat to such systems. However, conducting Side-Channel Analysis (SCA) to evaluate those threats currently requires deep expert knowledge, a lab environment, and numerous manual steps. Therefore, it is often ignored in security considerations. In this paper, we analyze the challenges when conducting SCA on consumer-grade devices using template-matching based triggering techniques. By introducing a three-staged framework called CORSICA, we elaborate the obstacles and deficiencies of current state-of-the-art techniques and provide potential solutions for them. Moreover, we validate our claims by introducing a method for semi-automatic extraction of a waveform template of an AES 128 encryption that can be used in combination with a template-matching triggering system. This extraction is based on generic meta information and is demonstrated on a consumer-grade ARM processor board.}, author = {Schlumberger, Jens and Wildermann, Stefan and Teich, Jürgen}, booktitle = {11th IFIP International Conference on New Technologies, Mobility and Security (NTMS)}, date = {2021-04-19/2021-04-21}, doi = {10.1109/NTMS49979.2021.9432644}, edition = {2}, editor = {IEEE}, faupublication = {yes}, keywords = {Side-Channel Analysis; Off-The-Shelves Devices; template-matching trigger}, pages = {1-5}, peerreviewed = {Yes}, title = {{CORSICA}: {A} {Framework} for {Conducting} {Real}-{World} {Side}-{Channel} {Analysis}}, venue = {Paris, France}, year = {2021} } @inproceedings{faucris.123727384, abstract = {Covering the whole set of Pareto-optimal solutions is a desired task of multi-objective optimization methods. Because in general it is not possible to determine this set, a restricted amount of solutions are typically delivered in the output to decision makers. In this paper, we propose a new method using multi-objective particle swarm optimization to cover the Pareto-optimal front. The method works in two phases. In phase 1 the goal is to obtain a good approximation of the Pareto-front. In a second run subswarms are generated to cover the Pareto-front. The method is evaluated using different test functions and compared with an existing covering method using a real world example in antenna design.}, author = {Mostaghim, Sanaz and Teich, Jürgen}, booktitle = {Proceedings of the Congress on Evolutionary Computation (CEC '04)}, date = {2004-06-20/2004-06-23}, faupublication = {yes}, isbn = {9780780385153}, pages = {1404-1411}, peerreviewed = {unknown}, title = {{Covering} {Pareto}-optimal fronts by subswarms in multi-objective particle swarm optimization}, url = {https://www.scopus.com/inward/record.url?partnerID=HzOxMe3b&scp=4344649636&origin=inward}, venue = {Portland, OR}, volume = {2}, year = {2004} } @inproceedings{faucris.116230224, abstract = {We present new hierarchical set oriented methods for the numerical solution of multi-objective optimization problems. These methods are based on a generation of collections of subdomains (boxes) in parameter space which cover the entire set of Pareto points. In the course of the subdivision procedure these coverings get tighter until a desired granularity of the covering is reached. For the evaluation of these boxes we make use of evolutionary algorithms. We propose two particular strategies and discuss combinations of those which lead to a better algorithmic performance. Finally we illustrate the efficiency of our methods by several examples. © Springer-Verlag Berlin Heidelberg 2003.}, author = {Dellnitz, Michael and Mostaghim, Sanaz and Schütze, Oliver and Teich, Jürgen}, booktitle = {Proceedings of the Second International Conference on Evolutionary Multi-Criterion Optimization (EMO)}, faupublication = {no}, note = {UnivIS-Import:2015-04-16:Pub.2003.tech.IMMD.inform.coveri}, pages = {118-132}, publisher = {Springer-verlag}, series = {Covering Pareto Sets by Multilevel}, title = {{Covering} {Pareto} {Sets} by {Multilevel} {Evolutionary} {Subdivision} {Techniques}}, venue = {Faro}, year = {2003} } @inproceedings{faucris.319617636, abstract = {Physical Side-Channel Analysis (SCA) is often restricted to less complex devices, such as microcontrollers, as more feature-rich targets, like microprocessor systems, contain possibly multiple sources of systemic noise that influence side-channels and their waveforms non-deterministically in the view of an observer. Thus, locating these waveforms using established waveform matching techniques and subsequent data association becomes impossible. This impedes the application of SCA techniques relying on the relationship of operation waveforms and their processed data. In order to address this issue, this paper presents the CRESTS algorithm, that is capable
of locating instances of target operations in larger side-channel traces affected by systemic noise. This enables to extract the waveforms of the relevant operations from the trace and, by maintaining
their overall sequence of occurrence, associate them with their corresponding data to then apply established SCA techniques. The effectiveness of our approach is evaluated by applying CRESTS on
the widespread XTS-AES algorithm running on a Beaglebone Black System-on-Chip (SoC) target platform.In this paper, we present a parallel waveform-matching architecture capable of performing high-speed waveform matching on a high-end FPGA-based digitizer. We also present a workflow for calibrating the waveform-matching system to the specific pattern of the CO in the presence of hardware restrictions provided by the FPGA hardware. Our implementation enables waveform matching at 10 GS/s, offering a speedup of 50x compared to the fastest state-of-the-art implementation known to us. We demonstrate how to apply the technique for attacking the widespread XTS-AES algorithm using waveform matching to recover the encrypted tweak even in the presence of so-called systemic noise. fronts of Pareto-optimal (T,E,C) solutions. This exploration helps to determine the limits of scalability of the presented tiled CGRA accelerator architectures in terms of throughput, the number of parallel layers that can be simultaneously processed, and memory requirements. Finally, we provide an evaluation of energy savings achievable on our architecture in comparison to implementations that execute sequentially a CNN layer-by-layer. In experiments, it is shown that layer-parallel processing is able to reduce energy consumption E by 3.6×, hardware cost C by 1.2×, and increase the achievable throughput T by 6.2× for MobileNet.

This paper presents a generalization of this scheme to arbitrary bit widths and number of multiplications. We also demonstrate that the previously proposed approach leads to errors (Mean Absolute Error (MAE) = 0.37). Furthermore, we explain where these errors come from and how they can be corrected. On top, we introduce a novel approximate method called "Overpacking" which allows to squeeze even more multiplications into a single DSP at the cost of small errors (MAE = 0.47). Overpacking allows to squeeze six 4-bit multiplications into a single DSP compared to just four in the literature. Finally, we introduce an alternative method for packing multiple small-bit width additions into a single 48-bit accumulator for use in applications such as Spiking Neural Networks. average latency of inference, but they cannot reduce the longest-path latency of inference. In contrast, we present a novel approach of dynamic filter pruning that utilizes explainable AI along with early coarse prediction in the intermediate layers of a CNN. This coarse prediction is performed using a simple branch that is trained to perform top-k classification. The branch either predicts the output class with high confidence, in which case the rest of the computations are left out. Alternatively, the branch predicts the output class to be within a subset of possible output classes. After this coarse prediction, only those filters that are important for this subset of classes are then evaluated. The importances of filters for each output class are obtained using explainable AI. Using this concept of dynamic pruning, we are able not only to reduce the average latency of inference, but also the longest-path latency of inference. Our proposed architecture for dynamic pruning can be deployed on different hardware platform}, address = {New York, NY, United States}, author = {Sabih, Muhammad and Hannig, Frank and Teich, Jürgen}, booktitle = {Proceedings of the 2nd European Workshop on Machine Learning and Systems (EuroMLSys)}, date = {2022-04-05/2022-04-08}, doi = {10.1145/3517207.3526982}, faupublication = {yes}, isbn = {978-1-4503-9254-9}, keywords = {Explainable AI; Filter Pruning; CNN; Dynamic Pruning}, pages = {109–115}, peerreviewed = {Yes}, publisher = {Association for Computing Machinery (ACM)}, title = {{DyFiP}: {Explainable} {AI}-based {Dynamic} {Filter} {Pruning} of {Convolutional} {Neural} {Networks}}, venue = {Rennes, France}, year = {2022} } @book{faucris.123437864, editor = {Becker, Jürgen and Teich, Jürgen and Athanas, Peter and Brebner, Gordon}, faupublication = {yes}, peerreviewed = {automatic}, series = {Proceedings of the Dagstuhl Seminar Nº 06141}, title = {{Dynamically} {Reconfigurable} {Architectures}}, year = {2006} } @article{faucris.111412664, author = {Bergmann, Neil and Platzner, Marco and Teich, Jürgen}, faupublication = {yes}, journal = {EURASIP Journal on Embedded Systems}, note = {UnivIS-Import:2015-03-09:Pub.2007.tech.IMMD.inform.dynami}, pages = {Article ID 28405, 2 pages}, peerreviewed = {unknown}, title = {{Dynamically} {Reconfigurable} {Architectures}}, volume = {2007}, year = {2007} } @book{faucris.115078084, abstract = {Dynamically Reconfigurable Systems is the first ever to focus on the emerging field of Dynamically Reconfigurable Computing Systems. While programmable logic and design-time configurability are well elaborated and covered by various texts, this book presents a unique overview over the state of the art and recent results for dynamic and run-time reconfigurable computing systems. Reconfigurable hardware is not only of utmost importance for large manufacturers and vendors of microelectronic devices and systems, but also a very attractive technology for smaller and medium-sized companies. Hence, Dynamically Reconfigurable Systems also addresses researchers and engineers actively working in the field and provides them with information on the newest developments and trends in dynamic and run-time reconfigurable systems. © Springer Science+Business Media B.V. 2010.}, address = {Heidelberg}, author = {Platzner, Marco and Teich, Jürgen and Wehn, Norbert}, doi = {10.1007/978-90-481-3485-4}, faupublication = {yes}, isbn = {978-90-481-3484-7}, note = {UnivIS-Import:2015-04-02:Pub.2010.tech.IMMD.inform.dynami}, pages = {267}, publisher = {Springer}, title = {{Dynamically} {Reconfigurable} {Systems} - {Architectures}, {Design} {Methods} and {Applications}}, year = {2010} } @inproceedings{faucris.122200144, abstract = {This paper presents a novel application-driven and resource-aware mapping methodology for tree-structured streaming applications onto NoCs. This includes strategies for mapping the source of streaming applications (seed point selection), as well as embedding strategies so that each process autonomously embeds its own succeeding tasks. The proposed embedding strategies only consider the local view of neighboring cells on the NoC which allows to significantly reduce computation and monitoring overhead. Our vision is that this approach facilitates self-organizing embedded systems that provide the flexibility and fault-tolerance required in future silicon technologies. The results provided in this paper show that our local and decentralized algorithms can compete with previously presented global and centralized algorithms. © 2011 ACM.}, address = {New York, NY, USA}, author = {Weichslgartner, Andreas and Wildermann, Stefan and Teich, Jürgen}, booktitle = {Proc. Fifth ACM/IEEE International Symposium on Networks-on-Chip}, date = {2011-05-01/2011-05-04}, doi = {10.1145/1999946.1999979}, faupublication = {yes}, keywords = {decentralized mapping; graph embedding; Networks-on-Chip}, note = {UnivIS-Import:2015-04-16:Pub.2011.tech.IMMD.inform.dynami}, pages = {201-208}, publisher = {IEEE Computer Society}, title = {{Dynamic} {Decentralized} {Mapping} of {Tree}-{Structured} {Applications} on {NoC} {Architectures}}, venue = {Pittsburgh}, year = {2011} } @article{faucris.119377984, abstract = {We propose a new method for defragmenting the module layout of a reconfigurable device, enabled by a novel approach for dealing with communication needs between relocated modules and with inhomogeneities found in commonly used FPGAs. Our method is based on dynamic relocation of module positions during runtime, with only very little reconfiguration overhead; the objective is to maximize the length of contiguous free space that is available for new modules. We describe a number of algorithmic aspects of good defragmentation, and present an optimization method based on tabu search. Experimental results indicate that we can improve the quality of module layout by roughly 50% over the static layout. Among other benefits, this improvement avoids unnecessary rejections of modules. © 2012 ACM.}, author = {Fekete, Sandor P. and Kamphans, Tom and Schweer, Nils and Tessars, Christopher and Van Der Veen, Jan C. and Angermeier, Josef and Koch, Dirk and Teich, Jürgen}, doi = {10.1145/2209285.2209287}, faupublication = {yes}, journal = {ACM Transactions on Reconfigurable Technology and Systems}, keywords = {Complexity; Defragmentation; Dynamic reconfiguration; Hardware/software codesign; Local search; Physical sorting; Reconfigurable devices}, note = {UnivIS-Import:2015-03-09:Pub.2012.tech.IMMD.inform.dynami{\_}1}, pages = {1-20}, peerreviewed = {unknown}, title = {{Dynamic} {Defragmentation} of {Reconfigurable} {Devices}}, volume = {5}, year = {2012} } @inproceedings{faucris.118051604, abstract = {In this paper we present an extension of the class of piece-wise linear algorithms (PLAs) in order to model one type of dynamic data dependencies. This extension significantly increases the range of applications which can be parallelized and mapped to massively parallel processor arrays. For instance, a lot of computational intensive applications for video and image processing consist of nested loop programs with only few and simple run-time dependent conditionals. Furthermore, we outline in which case these extensions can directly used - with slight changes - within traditional mapping methodologies based on loop parallelization in the polytope model. Additionally, we outline future research directions in the case existing methods will be inefficient.}, author = {Hannig, Frank and Teich, Jürgen}, booktitle = {Proceedings of the Fourth International Conference on Parallel Computing in Electrical Engineering (PARELEC 2004)}, date = {2004-09-07/2004-09-10}, faupublication = {yes}, isbn = {0-7695-2080-4}, note = {UnivIS-Import:2015-04-16:Pub.2004.tech.IMMD.inform.dynami{\_}5}, pages = {79 - 84}, title = {{Dynamic} {Piecewise} {Linear}/{Regular} {Algorithms}}, venue = {Dresden}, year = {2004} } @inproceedings{faucris.122672484, abstract = {In this paper, we present fault-tolerance strategies for implementing passive replication techniques in networked embedded systems based on TDMA-communication such as FlexRay busses. In particular, we assume that processes are replicated at different nodes for tolerating node failures. Hence, if one node fails another node can execute the process and requires the bandwidth for transmitting those messages created by the process over the bus medium. Two concepts are introduced to solve this problem: 1.) to replicate not only the processes but also the messages and to reserve the required bandwidth a priori at design time or 2.) to reconfigure the TDMA-schedule and assign the bandwidth dynamically to the nodes. Obviously, reserving bandwidth for each failure case might lead to a huge overhead and to long response times. Therefore, we provide different reconfiguration strategies for the recently developed FlexRay bus. Moreover, the timing behavior as well as the implementation overhead are evaluated with the help of an experimental setup consisting of five FlexRay nodes. © 2008 Springer-Verlag Berlin Heidelberg.}, author = {Brendle, Robert and Streichert, Thilo and Koch, Dirk and Haubelt, Christian and Teich, Jürgen}, booktitle = {Proceedings of the International Conference on Architecture of Computing Systems (ARCS 2008)}, date = {2008-02-25/2008-02-28}, doi = {10.1007/978-3-540-78153-0{\_}10}, faupublication = {yes}, isbn = {9783540781523}, pages = {117-129}, peerreviewed = {unknown}, title = {{Dynamic} reconfiguration of {FlexRay} schedules for response time reduction in asynchronous fault-tolerant networks}, venue = {Dresden}, year = {2008} } @inproceedings{faucris.203550247, author = {Henkel, Jörg and Teich, Jürgen and Wildermann, Stefan and Amrouch, Hussam}, booktitle = {Proceedings of International Conference On Computer Aided Design 2018}, date = {0018-11-05/0018-11-08}, doi = {10.1145/3240765.3243471}, faupublication = {yes}, isbn = {978-1-4503-5950-4}, pages = {60:1 - 60:6}, peerreviewed = {unknown}, title = {{Dynamic} {Resource} {Management} for {Heterogeneous} {Many}-{Cores}}, venue = {San Diego, CA}, year = {2018} } @inproceedings{faucris.121286924, abstract = {In this paper, a new methodology for tolerating link as well as node defects in self-adaptive reconfigurable networks will be presented. Currently, networked embedded systems need a certain level of redundancy for each node and link in order to tolerate defects and failures in a network. Due to monetary constraints as well as space and power limitations, the replication of each node and link is not an option in most embedded systems. Therefore, we will present a hardware/software partitioning algorithm for reconfigurable networks that optimizes the task binding onto resources at runtime such that node/link defects can be handled and data traffic on links between computational nodes will be minimized. This paper presents a new hardware/software partitioning algorithm, an experimental evaluation and for demonstrating the readability, an implementation on a network of FPGA-based boards. Copyright 2006 ACM.}, author = {Streichert, Thilo and Strengert, Christian and Haubelt, Christian and Teich, Jürgen}, booktitle = {In Proceedings of SBCCI 2006}, faupublication = {yes}, isbn = {9781595934796}, keywords = {Fault-tolerance; Online hardware/software partitioning; Reconfigurable system}, pages = {38-43}, peerreviewed = {unknown}, title = {{Dynamic} task binding for hardware/software reconfigurable networks}, url = {https://www.scopus.com/inward/record.url?partnerID=HzOxMe3b&scp=33750914830&origin=inward}, venue = {Ouro Preto}, volume = {2006}, year = {2006} } @inproceedings{faucris.118572784, abstract = {For medical imaging applications, a timely execution of tasks is essential. Hence, running multiple applications on the same system, scheduling with the capability of task preemption and prioritization becomes mandatory. Using GPUs as accelerators in this domain, imposes new challenges since GPU's common FIFO scheduling does not support task prioritization and preemption. As a remedy, this paper investigates the employment of resource management and scheduling techniques for applications from the medical domain for GPU accelerators. A scheduler supporting both, priority-based and LDF scheduling is added to the system such that high-priority tasks can interrupt tasks already enqueued for execution. The scheduler is capable of utilizing multiple GPUs in a system to minimize the average response time of applications. Moreover, it supports simultaneous execution of multiple tasks to hide data transfers latencies. We show that the scheduler interrupts scheduled and already enqueued applications to fulfill the timing requirements of high-priority dynamic tasks. © 2012 Springer-Verlag.}, address = {New York, NY, USA}, author = {Membarth, Richard and Lupp, Jan-Hugo and Hannig, Frank and Teich, Jürgen and Körner, Mario and Eckert, Wieland}, booktitle = {Proc. of the 25th International Conference on Architecture of Computing Systems (ARCS)}, date = {2012-02-28/2012-03-02}, doi = {10.1007/978-3-642-28293-5{\_}13}, faupublication = {yes}, isbn = {978-3-642-28292-8}, keywords = {CUDA; Dynamic task-scheduling; GPU; Medical imaging; Resource management}, note = {UnivIS-Import:2015-04-16:Pub.2012.tech.IMMD.inform.dynami}, pages = {147-159}, publisher = {Springer-verlag}, title = {{Dynamic} {Task}-{Scheduling} and {Resource} {Management} for {GPU} {Accelerators} in {Medical} {Imaging}}, venue = {Munich}, year = {2012} } @inproceedings{faucris.118488304, abstract = {CAN bus systems are used in many industrial control applications, particularly automotive. Due to growing system and functional requirements, the low capacity of the CAN bus and usually strict conditions under which it is used in realtime applications, applicability of CAN bus is severely limited. The paper presents an approach for achieving high utilization and breathes new life to CAN bus based systems by proposing a dynamic offset adaptation algorithm for scheduling messages and improving message response times without any changes to a standard CAN bus. This simple algorithm, which runs on all nodes of the system, results in excellent average response times at all loads and makes the approach particularly attractive for soft real-time systems. We demonstrate the performance improvement of the proposed approach by comparisons to other approaches and introduce a new performance measure in the form of a rating function. © 2011 EDAA.}, address = {New York, NY, USA}, author = {Ziermann, Tobias and Salcic, Zoran and Teich, Jürgen}, booktitle = {Proc. of DATE}, date = {2011-03-14/2011-03-18}, faupublication = {yes}, isbn = {978-1-61284-208-0}, keywords = {CAN; Controller Area Network; distributed embedded systems; response time; WCRT}, note = {UnivIS-Import:2015-04-16:Pub.2011.tech.IMMD.inform.dynoaa}, pages = {269-272}, publisher = {IEEE Press}, title = {{DynOAA} - {Dynamic} {Offset} {Adaptation} {Algorithm} for {Improving} {Response} {Times} of {CAN} {Systems}}, url = {http://ieeexplore.ieee.org/xpls/abs{\_}all.jsp?arnumber=5763272}, venue = {Grenoble}, year = {2011} } @inproceedings{faucris.116964584, abstract = {A new paradigm to support the communication among modules dynamically placed on a reconfigurable device at run-time is presented. Based on the network on chip (NoC) infrastructure, we developed a dynamic communication infrastructure as well as routing methodologies capable to handle routing in a NoC with obstacles created by dynamically placed components. We prove the unrestricted reachability of components and pins, the deadlock-freeness and we finally show the feasibility of our approach by means on real life example applications. © 2005 IEEE.}, author = {Ahmadinia, Ali and Bobda, Christophe and Fekete, Sandor P. and Majer, Mateusz and Teich, Jürgen and Van Der Veen, Jan C.}, booktitle = {Proceedings of the International Conference on Field-Programmable Logic and Applications}, date = {2005-08-24/2005-08-26}, doi = {10.1109/FPL.2005.1515715}, faupublication = {yes}, note = {UnivIS-Import:2015-04-16:Pub.2005.tech.IMMD.inform.dynoca}, pages = {153-158}, title = {{DyNoC}: {A} {Dynamic} {Infrastructure} for {Communication} in {Dynamically} {Reconfigurable} {Devices}}, venue = {Tampere}, year = {2005} } @inproceedings{faucris.111278904, address = {Berlin}, author = {Reimann, Felix and Kern, Andreas and Haubelt, Christian and Streichert, Thilo and Teich, Jürgen}, booktitle = {GMM-Fachbericht - Automotive meets Electronics}, faupublication = {yes}, isbn = {978-3-8007-3236-4}, note = {UnivIS-Import:2019-08-15:Pub.2010.tech.IMMD.inform.echtze}, pages = {9-14}, peerreviewed = {unknown}, publisher = {VDE VERLAG}, title = {{Echtzeitanalyse} {Ethernet}-basierter {E}/{E}-{Architekturen} im {Automobil}}, venue = {Dortmund, Germany}, year = {2010} } @inproceedings{faucris.122148004, abstract = {For the next generations of Processor-Arrays-on-Chip(e. g., coarse-grained reconfigurable or programmable arrays)— including more than 100s to 1000s of processing elements—it is very important to keep the on-chip configuration/instruction memories as small as possible. Hence, compilers must take into account the scarceness of available instruction memory and create the code as compact as possible [1]. However, Very Long Instruction Word (VLIW) processors have thewell-known problem that compilers typically produce lengthycodes. A lot of unnecessary code is produced due to unused Functional Units (FUs) or repeating operations for single FUs in instruction sequences. Techniques like software pipeliningcan be used to improve the utilization of the FUs, yet with therisk of code explosion [2] due to the overlapped scheduling of multiple loop iterations or other control flow statements. This is, where our proposed Orthogonal Instruction Processing (OIP) architecture (see Fig. 1) shows benefits in reducing the code size of compute-intensive loop programs. The idea is, contrary to lightweight VLIW processors used in arrays like Tightly Coupled Processor Arrays (TCPAs) [4], to equip each FU with its own instruction memory, branch unit, andprogram counter, but still let the FUs share the register files as well as input and output signals. This enables a processorto orthogonally execute a loop program. Each FU can execute its own sub-program while exchanging data over the register files. The branch unit and its instruction format have to beslightly changed by introducing a counter to each instructionthat determines how often the instruction is repeated until the specified branch is executed. This enables repeating instructions without repeating them in the code. Those kind of processors have to be carefully programmed, e. g., to not run into data dependency problems while optimizing throughput. For solving this resource-constrained modulo scheduling problem, we use techniques based on mixed integer linear programming [5], [3].
Obviously, the modifications of the processor produce architectural overhead in form of additional branch units and an increase of instruction memory compared to the lightweight VLIW processors. Thus, we created an analytical model of both the lightweight VLIW processor and our proposed architecture to analyze the overhead. The model gives an upper bound of the hardware costs and the memory consumption according to [7]. We examined the HW costs of a lightweight VLIW processor with different instruction memory lengths mVLIW and compared them to our OIP processor with varying instruction ratios IR and thus instruction memory lengths mOIP of each FU’s instruction memory. In the examination, we covered processors containingten FUs and averaged the HW costs over the instruction ratio. Figure 2 shows that the overhead is negligible as soon as we are able to reduce program sizes to 50 % (i. e., IR = 2), whichis usually achieved by our compile}, author = {Brand, Marcel and Hannig, Frank and Tanase, Alexandru-Petru and Teich, Jürgen}, booktitle = {2017 IEEE 28th International Conference on Application-specific Systems, Architectures and Processors (ASAP)}, date = {2017-07-10/2017-07-12}, doi = {10.1109/ASAP.2017.7995282}, editor = {IEEE}, faupublication = {yes}, isbn = {978-1-5090-4825-0}, pages = {207}, peerreviewed = {unknown}, title = {{Efficiency} in {ILP} {Processing} by {Using} {Orthogonality}}, venue = {Seattle}, year = {2017} } @inproceedings{faucris.265210755, author = {Groth, Stefan and Teich, Jürgen and Hannig, Frank}, booktitle = {Proceedings of the 24th International Workshop on Software and Compilers for Embedded Systems}, date = {2021-11-01/2021-11-02}, doi = {10.1145/3493229.3493305}, faupublication = {yes}, keywords = {Tensor core unit, Convolution, Image processing, Parallel algorithm}, pages = {1–6}, peerreviewed = {Yes}, title = {{Efficient} {Application} of {Tensor} {Core} {Units} for {Convolving} {Images}}, venue = {Eindhoven (NL)}, year = {2021} } @inproceedings{faucris.122672924, abstract = {In this paper, we propose an efficient modeling approach that permits simulation-based performance evaluation of MPSoCs at Electronic System Level (ESL). The approach is based on a SystemC simulation framework and allows for evaluating timing effects from resource contention when mapping applications to MPSoC platforms. The abstraction level used for modeling timing corresponds to approximately-timed transaction level models. This allows for an accurate performance modeling, including temporal effects from preemptive processor scheduling and bus arbitration. However, in contrast to standard SystemC TLM, application mapping and platform models are configurable and, thus, enable design space exploration at ESL. We use a Motion-JPEG decoder application to illustrate and assess the benefits of the proposed approach.}, author = {Streubühr, Martin and Gladigau, Jens and Haubelt, Christian and Teich, Jürgen}, booktitle = {Forum on specification and Design Languages 2009}, date = {2009-09-22/2009-09-24}, faupublication = {yes}, isbn = {9782953050417}, peerreviewed = {unknown}, title = {{Efficient} approximately-timed performance modeling for architectural exploration of {MPSoCs}}, url = {https://www.scopus.com/inward/record.url?partnerID=HzOxMe3b&scp=77951563792&origin=inward}, venue = {Sophia Antipolis}, year = {2009} } @book{faucris.117079424, abstract = {In this chapter, we propose an efficient modeling approach that permits simulation-based performance evaluation of MPSoCs at Electronic System Level (ESL). The approach is based on a SystemC simulation framework and allows for evaluating timing effects from resource contention when mapping applications to MPSoC platforms. The abstraction level used for modeling timing corresponds to approximately-timed communication in transaction level models. This allows for an accurate performance modeling, including temporal effects from preemptive processor scheduling and bus arbitration. However, in contrast to standard OSCI TLM, application mapping and platform models are configurable and, thus, enable design space exploration at ESL. A Motion-JPEG decoder application is used to illustrate and assess the benefits of our approach. © 2010 Springer Science+Business Media B.V.}, author = {Streubühr, Martin and Gladigau, Jens and Haubelt, Christian and Teich, Jürgen}, doi = {10.1007/978-90-481-9304-2{\_}4}, faupublication = {yes}, isbn = {9789048193035}, keywords = {Electronic system level; Performance modeling; Simulation; SystemC}, pages = {59-72}, peerreviewed = {unknown}, title = {{Efficient} approximately-timed performance modeling for architectural exploration of {MPSoCs}}, year = {2010} } @inproceedings{faucris.116045204, abstract = {In this paper, we present an efficient exploration algorithm for architecture/compiler co-designs of application-specific instruction-set processors. The huge design space is spanned by processor architecture parameters as well as different compiler optimization strategies. The objective space is multi-dimensional including conflicting objectives such as hardware cost, execution time and code size. The goal of the presented exploration algorithm is to determine the set of Pareto-optimal designs and compiler settings for a given benchmark program. In a case study, while exploring Pareto-optimal designs for a given DSP benchmark program, we show that for a realistic architecture family, the huge search space may be reduced dramatically using proper techniques to prune search spaces that may not contain Pareto-optimal solutions. Finally, we analyse and present solutions on what is the best architecture for a mixture of benchmark programs, i.e., what are the best architecture/compiler co-designs to execute the DSPstone benchmark. Copyright 2002 ACM.}, author = {Fischer, Dirk and Teich, Jürgen and Thies, M. and Weper, R.}, booktitle = {ACM SIG Proceedings of the International Conference on Compilers, Architecture, and Synthesis for Embedded Systems}, date = {2002-10-08/2002-10-11}, doi = {10.1145/581630.581635}, faupublication = {no}, keywords = {Architecture/compiler codesign; ASIP; Multiobjective design space exploration; Retargetable compilation}, note = {UnivIS-Import:2015-04-16:Pub.2002.tech.IMMD.inform.effici}, pages = {27-34}, title = {{Efficient} {Architecture}/{Compiler} {Co}-{Exploration} for {ASIPs}}, venue = {Grenoble}, year = {2002} } @article{faucris.121332684, abstract = {In this paper, we present a novel methodology to calculate the Arithmetic Error Rate (AER) for deterministic approximate adder architectures where the calculation of each output bit is restricted to a subset of the input bits, denoted as visibilities. Such architectures have been widely proposed in the literature and are, e.g., obtained when splitting the carry chain in a carry-propagate adder into partitions each computed by a separate parallel adder, or when removing carry-lookahead operators in a parallel prefix adder. Our contribution is a unified calculus for determining the arithmetic error rate for (a) such deterministic approximate adder architectures making use of visibilities and (b) the general case of arbitrarily (also non-uniformly) distributed input bit}, author = {Echavarria Gutiérrez, Jorge Alfonso and Wildermann, Stefan and Potwigin, Eduard and Teich, Jürgen}, doi = {10.1109/LES.2017.2760922}, faupublication = {yes}, journal = {IEEE Embedded Systems Letters}, keywords = {Approximate Adders; Arithmetic Error Rate; Approximate Computing}, peerreviewed = {Yes}, title = {{Efficient} {Arithmetic} {Error} {Rate} {Calculus} for {Visibility} {Reduced} {Approximate} {Adders}}, year = {2017} } @article{faucris.265047612, author = {Khosravi, Faramarz and Raß, Alexander and Teich, Jürgen}, doi = {10.1145/3469801}, faupublication = {yes}, journal = {ACM Transactions on Evolutionary Learning and Optimization}, pages = {1-26}, peerreviewed = {Yes}, title = {{Efficient} {Computation} of {Probabilistic} {Dominance} in {Multi}-objective {Optimization}}, volume = {1}, year = {2021} } @article{faucris.111829344, abstract = {Processor array architectures are optimal platforms for computationally intensive applications. Such architectures are characterized by hierarchies of parallelism and memory structures, i.e. processor arrays apart from different levels of cache have a large number of processing elements (PE) where each PE can further contain sub-word parallelism. In order to handle large scale problems, balance local memory requirements with I/O-bandwidth, and use different hierarchies of parallelism and memory, one needs a sophisticated transformation called hierarchical partitioning. Innately the applications are data flow dominant and have almost no control flow, but the application of hierarchical partitioning techniques has the disadvantage of a more complex control flow. In a previous paper, the authors presented first time a methodology for the automated control path synthesis for the mapping of partitioned algorithms onto processor arrays. However, the control path contained complex multiplication and division operators. In this paper, we propose a significant extension to the methodology which reduces the hardware cost of the global controller and memory address generators by avoiding these costly operations. © 2006 Elsevier B.V. All rights reserved.}, author = {Dutta, Hritam and Hannig, Frank and Ruckdeschel, Holger and Teich, Jürgen}, doi = {10.1016/j.sysarc.2006.10.009}, faupublication = {yes}, journal = {Journal of Systems Architecture}, note = {UnivIS-Import:2015-03-09:Pub.2007.tech.IMMD.inform.effici}, pages = {300-309}, peerreviewed = {Yes}, title = {{Efficient} {Control} {Generation} for {Mapping} {Nested} {Loop} {Programs} onto {Processor} {Arrays}}, volume = {53}, year = {2007} } @inproceedings{faucris.318934337, abstract = {This paper proposes an approach for efficiently deploying neural network (NN) models on highly resource-constrained microcontroller architectures, particularly AURIX TC3xx microcontrollers. Here, compression and optimization techniques of the NN model are required to reduce execution time while maintaining accuracy on the target microcontroller. Furthermore, especially on AURIX TriCores that are frequently used in the automotive domain, there is a lack of support for automatic conversion and deployment of pretrained NN models. In this work, we present an approach that fills this gap, enabling the conversion and deployment of so-called thermal neural networks on AURIX TC3xx microcontrollers for the first time. Experimental results on three different NN types show that, when pruning of convolutional neural networks is applied, we can achieve a speedup of 2.7× compared to state-of-the-art thermal neural networks.6 logic gates circuit complexity, implementing a signal processing algorithm, can be analyzed for power and area within less than a minute on a standard consumer PC. Since currently there exists no published architecture-level power/area estimation framework for coarse-grained, software-programmable architectures, our work tries to address this shortcoming. Copyright © 2011 American Scientific Publishers All rights reserved.}, author = {Kissler, Dmitrij and Hannig, Frank and Teich, Jürgen}, doi = {10.1166/jolpe.2011.1114}, faupublication = {yes}, journal = {Journal of Low Power Electronics}, keywords = {Architectural-level power analysis; Coarse-grained reconfigurable architectures (CGRA); Design space exploration}, note = {UnivIS-Import:2015-04-14:Pub.2011.tech.IMMD.inform.effici{\_}2}, pages = {29-40}, peerreviewed = {unknown}, title = {{Efficient} {Evaluation} of {Power}/{Area}/{Latency} {Design} {Trade}-offs for {Coarse}-{Grained} {Reconfigurable} {Processor} {Arrays}}, volume = {7}, year = {2011} } @inproceedings{faucris.119372264, abstract = {In this paper we present a new approach for generating high-speed optimized event-driven instruction set level simulators for adaptive massively parallel processor architectures. The simulator generator is part of a methodology for the systematic mapping, evaluation, and exploration of massively parallel processor architectures that are designed for special purpose applications in the world of embedded computers. The generation of high-speed cycle-accurate simulators is of utmost importance here, because they are directly used both for parallel processor architecture debugging and evaluation purposes, as well as during time-consuming architecture/compiler co-exploration. We developed a modeling environment which automatically generates a C++ simulation model either from a graphical input or directly from an XML-based architecture description. Here, we focus on the underlying event-driven simulation model and present our modeling environment, in particular the features of the graphical parallel processor architecture editor and the automatic instruction set level simulator generator. Finally, in a case-study, we demonstrate the pertinence of our approach by simulating different processor arrays. The superior performance of the generated simulators compared to existing simulators and simulator generation approaches is shown.}, author = {Kupriyanov, Olexiy and Kissler, Dmitrij and Hannig, Frank and Teich, Jürgen}, booktitle = {Proceedings of the 10th International Workshop on Software and Compilers for Embedded Systems (SCOPES)}, doi = {10.1145/1269843.1269854}, faupublication = {yes}, keywords = {Embedded tools; Modeling; Processor arrays; Simulation}, pages = {71-80}, peerreviewed = {unknown}, title = {{Efficient} event-driven simulation of parallel processor architectures}, venue = {Nice}, volume = {235}, year = {2007} } @article{faucris.284006916, abstract = {Spiking neural networks (SNNs) compute in an event-based manner to achieve a more efficient computation than standard neural networks. In SNNs, neuronal outputs are not encoded as real-valued activations but as sequences of binary spikes. The motivation of using SNNs over conventional neural networks is rooted in the special computational aspects of spike-based processing, especially the high degree of sparsity of spikes. Well-established implementations of convolutional neural networks (CNNs) feature large spatial arrays of processing elements (PEs) that remain highly underutilized in the face of activation sparsity. We propose a novel architecture optimized for the processing of convolutional SNNs (CSNNs) featuring a high degree of sparsity. The proposed architecture consists of an array of PEs of the size of the kernel of a convolution and an intelligent spike queue that provides a high PE utilization. A constant flow of spikes is ensured by compressing the feature maps into queues that can then be processed spike-by-spike. This compression is performed at run-time, leading to a self-timed schedule. This allows the processing time to scale with the number of spikes. Also, a novel memory organization scheme is introduced to efficiently store and retrieve the membrane potentials of the individual neurons using multiple small parallel on-chip RAMs. Each RAM is hardwired to its PE, reducing switching circuitry. We implemented the proposed architecture on an FPGA and achieved a significant speedup compared to previously proposed SNN implementations (~10 times) while needing less hardware resources and maintaining a higher energy efficiency (~15 times}, author = {Sommer, Jan and Özkan, Mehmet Akif and Keszöcze, Oliver and Teich, Jürgen}, doi = {10.1109/TCAD.2022.3197512}, faupublication = {yes}, journal = {IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems}, keywords = {Event-based processing; field-programmable gate array (FPGA); hardware acceleration; spiking convolutional neural networks (SNNs)}, pages = {3767 - 3778}, peerreviewed = {Yes}, title = {{Efficient} {Hardware} {Acceleration} of {Sparsely} {Active} {Convolutional} {Spiking} {Neural} {Networks}}, volume = {41}, year = {2022} } @inproceedings{faucris.277693698, abstract = {Spiking neural networks (SNNs) compute in an event-based manner to achieve a more efficient computation than standard neural networks. In SNNs, neuronal outputs are not encoded as real-valued activations but as sequences of binary spikes. The motivation of using SNNs over conventional neural networks is rooted in the special computational aspects of spike-based processing, especially the high degree of sparsity of spikes. Well-established implementations of convolutional neural networks (CNNs) feature large spatial arrays of processing elements (PEs) that remain highly underutilized in the face of activation sparsity. We propose a novel architecture optimized for the processing of convolutional SNNs (CSNNs) featuring a high degree of sparsity. The proposed architecture consists of an array of PEs of the size of the kernel of a convolution and an intelligent spike queue that provides a high PE utilization. A constant flow of spikes is ensured by compressing the feature maps into queues that can then be processed spike-by-spike. This compression is performed at run-time, leading to a self-timed schedule. This allows the processing time to scale with the number of spikes. Also, a novel memory organization scheme is introduced to efficiently store and retrieve the membrane potentials of the individual neurons using multiple small parallel on-chip RAMs. Each RAM is hardwired to its PE, reducing switching circuitry. We implemented the proposed architecture on an FPGA and achieved a significant speedup compared to previously proposed SNN implementations (~10 times) while needing less hardware resources and maintaining a higher energy efficiency (~15 times}, author = {Sommer, Jan and Özkan, Mehmet Akif and Keszöcze, Oliver and Teich, Jürgen}, booktitle = {International Conference on Hardware/Software Codesign and System Synthesis (CODES+ISSS)}, date = {2022-10-07/2022-10-14}, doi = {10.1109/tcad.2022.3197512}, faupublication = {yes}, keywords = {Event-based processing; field-programmable gate array (FPGA); hardware acceleration; spiking convolutional neural networks (SNNs)}, peerreviewed = {Yes}, series = {Proceedings of the International Conference on Hardware/Software Codesign and System Synthesis (CODES+ISSS)}, title = {{Efficient} {Hardware} {Acceleration} of {Sparsely} {Active} {Convolutional} {Spiking} {Neural} {Networks}}, venue = {Shanghai}, year = {2022} } @inproceedings{faucris.117079644, abstract = {Progress in reconfigurable hardware technology allows the implementation of complete SoCs in today's FPGAs. In the context design for reliability, software checkpointing is an effective methodology to cope with faults. In this paper, we systematically extend the concept of checkpointing known from software systems to hardware tasks running on reconfigurable devices. We will classify different mechanisms for hardware checkpointing and present formulas for estimating the hardware overhead. Moreover, we will reveal a tool that takes over the burden of modifying hardware modules for checkpointing. Post-synthesis results of applying our methodology to different hardware accelerators will be presented and the results will be compared with the theoretical estimations. Copyright 2007 ACM.}, author = {Koch, Dirk and Haubelt, Christian and Teich, Jürgen}, booktitle = {Proceedings of the 15th ACM/SIGDA International Symposium on Field-Programmable Gate Arrays (FPGA 2007)}, date = {2007-02-18/2007-02-20}, doi = {10.1145/1216919.1216950}, faupublication = {yes}, isbn = {9781595936004}, keywords = {Checkpointing; State access}, pages = {188-196}, peerreviewed = {unknown}, title = {{Efficient} hardware checkpointing: {Concepts}, overhead analysis, and implementation}, venue = {Monterey, CA}, year = {2007} } @inproceedings{faucris.117080304, abstract = {Starting Electronic System Level (ESL) design flows with executable High-Level Models (HLMs) has the potential to sustainably improve productivity. However, writing good HLMs for complex systems is still a challenging task. In the context of network controller design, modeling complexity has two major sources: (1) the functionality to handle a single connection, and (2) the number of connections to be handled in parallel. In this paper, we will propose an efficient actor-oriented modeling approach for complex systems by (1) integrating hierarchical FSMs into dynamic dataflow models, and (2) providing new channel types to allow concurrent processing of multiple connections. We will show the applicability of our proposed modeling approach to real-world system designs by presenting results from modeling and simulating a network controller for the Parallel Sysplex architecture used in IBM System z mainframes. © 2010 EDAA.}, author = {Zebelein, Christian and Falk, Joachim and Haubelt, Christian and Teich, Jürgen and Dorsch, Rainer}, booktitle = {Proceedings of Design, Automation and Test in Europe (DATE 2010)}, date = {2010-03-08/2010-03-12}, faupublication = {yes}, isbn = {9783981080162}, pages = {1189-1194}, peerreviewed = {unknown}, title = {{Efficient} high-level modeling in the networking domain}, url = {https://www.scopus.com/inward/record.url?partnerID=HzOxMe3b&scp=77953092909&origin=inward}, venue = {Dresden}, year = {2010} } @article{faucris.224634960, author = {Heidorn, Christian and Witterauf, Michael and Hannig, Frank and Teich, Jürgen}, doi = {10.17706/jcp.14.8.541-556}, faupublication = {yes}, journal = {Journal of Computers}, pages = {541-556}, peerreviewed = {Yes}, title = {{Efficient} {Mapping} of {CNNs} onto {Tightly} {Coupled} {Processor} {Arrays}}, volume = {14}, year = {2019} } @inproceedings{faucris.118282384, abstract = {In the last decade, there has been a dramatic growth in research and development of massively parallel commodity graphics hardware both in academia and industry. Graphics card architectures provide an optimal platform for parallel execution of many number crunching loop programs from fields like image processing, linear algebra, etc. However, it is hard to efficiently map such algorithms to the graphics hardware even with detailed insight into the architecture. This paper presents a multiresolution image processing algorithm and shows the efficient mapping of this type of algorithms to the graphics hardware. Furthermore, the impact of execution configuration is illustrated and a method is proposed to determine the best configuration offline in order to use it at run-time. Using CUDA as programming model, it is demonstrated that the image processing algorithm is significantly accelerated and that a speedup of up to 33x can be achieved on NVIDIA's Tesla C870 compared to a parallelized implementation on a Xeon Quad Core. © 2009 Springer Berlin Heidelberg.}, address = {Berlin / Heidelberg}, author = {Membarth, Richard and Hannig, Frank and Dutta, Hritam and Teich, Jürgen}, booktitle = {Proceedings of the 9th International Workshop on Systems, Architectures,Modeling, and Simulation (SAMOS)}, date = {2009-07-20/2009-07-23}, doi = {10.1007/978-3-642-03138-0{\_}31}, faupublication = {yes}, isbn = {978-3-642-03137-3}, note = {UnivIS-Import:2015-04-16:Pub.2009.tech.IMMD.inform.effici{\_}0}, pages = {277-288}, publisher = {Springer-verlag}, series = {Lecture Notes in Computer Science (LNCS)}, title = {{Efficient} {Mapping} of {Multiresolution} {Image} {Filtering} {Algorithms} on {Graphics} {Processors}}, venue = {Island of Samos}, volume = {5657}, year = {2009} } @incollection{faucris.119080104, author = {Membarth, Richard and Dutta, Hritam and Hannig, Frank and Teich, Jürgen}, booktitle = {Transactions on High-Performance Embedded Architectures and Compilers V}, doi = {10.1007/978-3-662-58834-5{\_}1}, faupublication = {yes}, isbn = {978-3-662-58833-8}, note = {UnivIS-Import:2015-04-20:Pub.2011.tech.IMMD.inform.effici}, pages = {1-20}, peerreviewed = {Yes}, publisher = {Springer}, series = {Lecture Notes in Computer Science (LNCS)}, title = {{Efficient} {Mapping} of {Streaming} {Applications} for {Image} {Processing} on {Graphics} {Cards}}, volume = {11225}, year = {2019} } @inproceedings{faucris.237093403, abstract = {

Hipacc is a domain-specific language for ease of programming image processing applications on hardware accelerators such as GPUs. It relieves the burden of manually porting algorithms to hardware for developers with the help of domain- and architecture-specific knowledge. One fundamental operation in image processing is reduction. Global reduction operators are the building blocks of many widely used algorithms, including image normalization, similarity estimation, etc. This paper presents an efficient approach to perform parallel reductions on GPUs with Hipacc. Our proposed approach benefits from the continuous effort of performance and programmability improvement by hardware vendors, for example, by utilizing the latest low-level primitives from Nvidia. Results show our approach achieves a speedup of up to 3.43 over an existing Hipacc implementation with traditional optimization methods, and a speedup of up to 9.02 over an implementation using the Thrust library from Nvidia.

}, author = {Qiao, Bo and Reiche, Oliver and Özkan, Mehmet Akif and Teich, Jürgen and Hannig, Frank}, booktitle = {Proceedings of the 23rd International Workshop on Software and Compilers for Embedded Systems (SCOPES)}, date = {2020-05-25/2020-05-26}, doi = {10.1145/3378678.3391885}, faupublication = {yes}, isbn = {978-1-4503-7131-5/20/05}, keywords = {Domain-Specific Languages, Parallel Reduction, GPUs}, pages = {58-61}, peerreviewed = {Yes}, title = {{Efficient} {Parallel} {Reduction} on {GPUs} with {Hipacc}}, venue = {Sankt Goar}, year = {2020} } @inproceedings{faucris.122677324, abstract = {This paper presents techniques for generating on-chip buses suitable for dynamically integrating hardware modules into an FPGA-based SoC by partial reconfiguration. The buses permit direct connections of master and slave modules to the bus in combination with a flexible fine-grained module placement and with minimized latency and area overheads. A test system will demonstrate a transfer rate of 800 MB/s while providing an extreme high placement flexibility. © 2008 IEEE.}, author = {Haubelt, Christian and Koch, Dirk and Teich, Jürgen}, booktitle = {Proceedings 16th Annual IEEE Symposium on Field-Programmable Custom Computing Machines (FCCM 2008)}, doi = {10.1109/FCCM.2008.33}, faupublication = {yes}, isbn = {9780769533070}, pages = {287-290}, peerreviewed = {unknown}, title = {{Efficient} reconfigurable on-chip buses for fpgas}, venue = {Palo Alto, California}, year = {2008} } @inproceedings{faucris.117435164, author = {Falk, Joachim and Haubelt, Christian and Teich, Jürgen}, booktitle = {Proceedings FDL'06, Forum on Design Languages 2006}, date = {3006-09-29/2006-09-22}, faupublication = {yes}, pages = {129 - 134}, peerreviewed = {unknown}, title = {{Efficient} {Representation} and {Simulation} of {Model}-{Based} {Designs} in {SystemC}}, venue = {Darmstadt}, year = {2006} } @inproceedings{faucris.109522424, abstract = {Nowadays many design space exploration tools are based on Multi-Objective Evolutionary Algorithms (MOEAs). Beside the advantages of MOEAs, there is one important drawback as MOEAs might fail in design spaces containing only a few feasible solutions or as they are often afflicted with premature convergence, i.e., the same design points are revisited again and again. Exact methods, especially Pseudo Boolean solvers (PB solvers) seem to be a solution. However, as typical design spaces are multi-objective, there is a need for multi-objective PB solvers. In this paper, we will formalize the problem of design space exploration as multi-objective 0-1 ILP. We will propose (1) a heuristic approach based on PB solvers and (2) a complete multi-objective PB solver based on a backtracking algorithm that incorporates the non-dominance relation from multi-objective optimization and is restricted to linear objective functions. First results from applying our novel multi-objective PB solver to synthetic problems will show its effectiveness in small sized design spaces as well as in large design spaces only containing a few feasible solutions. For non-linear and large problems, the proposed heuristic approach is outperforming common MOEA approaches. Finally, a real world example from the automotive area will emphasize the efficiency of the proposed algorithms. ©2008 IEEE.}, author = {Lukasiewycz, Martin and Glaß, Michael and Haubelt, Christian and Teich, Jürgen}, booktitle = {2008 Asia and South Pacific Design Automation Conference, ASP-DAC}, doi = {10.1109/ASPDAC.2008.4484040}, faupublication = {yes}, isbn = {9781424419227}, pages = {691-696}, peerreviewed = {unknown}, title = {{Efficient} symbolic multi-objective design space exploration}, venue = {Seoul}, year = {2008} } @incollection{faucris.248819626, author = {Smirnov, Fedor and Pourmohseni, Behnaz and Glaß, Michael and Teich, Jürgen}, booktitle = {Smart Cities, Green Technologies and Intelligent Transport Systems}, doi = {10.1007/978-3-030-68028-2{\_}9}, faupublication = {yes}, isbn = {978-3-030-68028-2}, pages = {173 - 199}, peerreviewed = {unknown}, publisher = {Springer}, title = {{Efficient} {Symbolic} {Routing} {Encoding} for {In}-vehicle {Network} {Optimization}}, url = {https://link.springer.com/chapter/10.1007/978-3-030-68028-2{\_}9}, year = {2021} } @article{faucris.286872699, abstract = {This paper proposes a novel approach for the generation of memory-efficient table-based function approxima-
tion circuits for edge devices in general, and FPGAs in particular. Given a function f(x) to be approximated in
a given interval [X₀,X₀+a) and a maximum approximation error Ea, the goal is to determine a function table implementation with a minimized memory footprint, i.e., number of entries that need to be stored. Rather than state-of-the-art work performing an equidistant sampling of the given interval by so-called breakpoints and using linear interpolation between two adjacent breakpoints to determine f(x) at the maximum error bound, we propose and compare three algorithms for splitting the given interval into sub-intervals to reduce the required memory footprint drastically based on the observation that in sub-intervals of low gradient, a coarser sampling grid may be assumed while guaranteeing the maximum interpolation error bound Ea. Experiments on elementary mathematical functions show that a large fraction in memory footprint may be saved. Second, a hardware architecture implementing the sub-interval selection, breakpoint lookup and interpolation at a latency of just 9 clock cycles is introduced. Third, for each generated circuit design, BRAMs are automatically instantiated rather than synthesizing the reduced footprint function table using LUT primitives providing an additional degree of resource efficiency. The approach presented here for FPGAs can equally be applied to other circuit technologies for fast and, at the same time, memory-optimized function approximation at the edge.deliver overly pessimistic and thus suboptimal results w.r.t. exploiting time slack in order to minimize
the energy consumption. This holds true in particular in case computation times of tasks may
be workload-dependent and becoming known only at runtime or in case of conditionally executed
tasks or scenarios. This paper studies and quantitatively evaluates different classes of algorithms for
scheduling periodic applications given by task graphs (i.e., DAGs) with precedence constraints and
a global deadline on homogeneous MPSoCs purely at runtime on a per-instance base. We present
and analyze algorithms providing provably optimal results as well as approximation algorithms
with proven guarantees on the achieved energy savings. For problem instances taken from realistic
embedded system benchmarks as well as synthetic scalable problems, we provide results on the
computation time and quality of each algorithm to perform a) scheduling and b) voltage/speed
assignments for each task at runtime. In our portfolio, we distinguish as well continuous and discrete
speed (e.g., DVFS-related) assignment problems. In summary, the presented ties between theory
(algorithmic complexity and optimality) and execution time analysis deliver important insights on
the practical usability of the presented algorithms for runtime optimization of task scheduling and
speed assignment on MPSoCs.Many embedded system applications impose hard real-time, energy or safety requirements on corresponding programs typically concurrently executed on a given MPSoC target platform. Even when mutually isolating applications in space or time, the enforcement of such properties, e.g., by adjusting the number of processors allocated to a program or by scaling the voltage/frequency mode of involved processors, is a difficult problem to solve, particularly in view of typically largely varying environmental input (workload) per execution. In this paper, we formalize the related control problem using finite state machine models for the uncertain environment determining the workload, the system response (feedback), as well as the enforcer strategy. The contributions of this paper are as follows: a) Rather than trace-based simulation, the uncertain environment is modeled by a discrete-time Markov chain (DTMC) as a random process to characterize possible input sequences an application may experience. b) A number of important verification goals to analyze different enforcer FSMs are formulated in PCTL for the resulting stochastic verification problem, i.e., the likelihood of violating a timing or energy constraint, or the expected number of steps for a system to return to a given execution time corridor. c) Applying stochastic model checking, i.e., PRISM to analyze and compare enforcer FSMs in these properties, and finally d) proposing an approach for reducing the environment DTMC by partitioning equivalent environmental states (i.e., input states leading to an equal system response in each MPSoC mode) such that verification times can be reduced by orders of magnitude to just a few ms for real-world examples.}, address = {New York, NY, USA}, author = {Esper, Khalil and Wildermann, Stefan and Teich, Jürgen}, booktitle = {Proceedings of the 19th ACM-IEEE International Conference on Formal Methods and Models for System Design}, date = {2021-11-20/2021-11-22}, doi = {10.1145/3487212.3487348}, faupublication = {yes}, isbn = {9781450391276}, keywords = {probabilistic model cheking, MPSoC, PCTL, Markov chain, verification, finite state machine, runtime requirement enforcement}, pages = {21–31}, peerreviewed = {Yes}, publisher = {Association for Computing Machinery}, series = {MEMOCODE '21}, title = {{Enforcement} {FSMs} - {Specification} and {Verification} of {Non}-{Functional} {Properties} of {Program} {Executions} on {MPSoCs}}, venue = {Beijing}, year = {2021} } @article{faucris.118332324, author = {Dorsch, Rainer and Haubelt, Christian and Teich, Jürgen}, faupublication = {yes}, journal = {Design & Elektronik}, note = {UnivIS-Import:2015-04-14:Pub.2008.tech.IMMD.inform.entdec}, pages = {22-27}, peerreviewed = {unknown}, title = {{Entdecke} die {Möglichkeiten}}, year = {2008} } @inproceedings{faucris.116230664, address = {Berlin}, author = {Teich, Jürgen}, booktitle = {VDE/VDI-Gesellschaft Mikroelektronik, Mikro- und Feinwerktechnik (GMM), GMM-Fachbericht}, faupublication = {yes}, note = {UnivIS-Import:2015-04-16:Pub.2003.tech.IMMD.inform.entwur}, publisher = {VDE-Verlag}, series = {Entwurfsautomatisierung elektronischer Systeme auf Systemebene}, title = {{Entwurfsautomatisierung} elektronischer {Systeme} auf {Systemebene}}, venue = {Erlangen}, year = {2003} } @inproceedings{faucris.120707004, author = {Fickenscher, Jörg and Bouzouraa, Mohammed Essayed and Hannig, Frank and Teich, Jürgen}, booktitle = {Vehicle Intelligence}, date = {2017-12-05/2017-12-07}, faupublication = {yes}, peerreviewed = {unknown}, title = {{Environment} {Mapping} {Using} {Massively} {Parallel} {Architectures}}, venue = {München}, year = {2017} } @book{faucris.117082064, abstract = {Dynamically partially reconfigurable architectures combine high performance and flexibility. They offer a novel possibility to dynamically load and execute hardware modules, previously only known for software modules. In order to realize these promises, the following dilemmas had to be solved: the too often limited memory of reconfigurable architectures for many data-intensive applications, the restricted communication possibilities for partial hardware modules, the unflexible tool flow for partial module design, and the IO-pin dilemma, that the placement of hardware modules, with requirements for input and output signals to the periphery, was predetermined to a single position. These were physical restrictions and technical problems limiting the scope or applicability of dynamically partially reconfigurable architectures. This led us to the development of a new FPGA-based reconfigurable computer called Erlangen Slot Machine, a platform for interdisciplinary research on dynamically reconfigurable systems. It leverages many architectural constraints of existing platforms and allows a user to partially reconfigure hardware modules arranged in so-called slots. The uniqueness of this computer stems from a) a new slot-oriented hardware architecture, b) a set of novel inter-module communication techniques, and c) concepts for dynamic and partial reconfiguration management. © 2010 Springer Science+Business Media B.V.}, author = {Angermeier, Josef and Bobda, Christophe and Majer, Mateusz and Teich, Jürgen}, doi = {10.1007/978-90-481-3485-4{\_}3}, faupublication = {yes}, isbn = {9789048134847}, pages = {51-71}, peerreviewed = {unknown}, publisher = {Springer Netherlands}, title = {{Erlangen} slot machine: {An} {FPGA}-based dynamically reconfigurable computing platform}, year = {2010} } @inproceedings{faucris.234583133, abstract = {Approximate Computing is a novel design paradigm sacrificing computational accuracy for gains in other non-functional properties. We present a tool that allows to automatically analyze approximated loop programs for the resulting error and range of output values. Our tool allows to model non-uniform distributions for the input variables. We further support the novel concept of Anytime Instructions: Anytime Instructions encode the number of most-significant mantissa bits to be computed in floating point operations. They are typically used to achieve execution time and energy reductions. First experiments using the tool show promising results. A large portion of image processing applications often come with stringent requirements regarding performance, energy efficiency, and power. FPGAs have proven to be among the most suitable architectures for algorithms that can be processed in a streaming pipeline. Yet, designing imaging systems for FPGAs remains a very time consuming task. High-Level Synthesis, which has significantly improved due to recent advancements, promises to overcome this obstacle. In particular, Altera OpenCL is a handy solution for employing an FPGA in a heterogeneous system as it covers all device communication. However, to obtain efficient hardware implementations, extreme code modifications, contradicting OpenCL’s data-parallel programming paradigm, are necessary.

In this work, we explore the programming methodology that yields significantly better hardware implementations for the Altera Offline Compiler. We furthermore designed a compiler back end for a domain-specific source-to-source compiler to leverage the algorithm description to a higher level and generate highly optimized OpenCL code. Moreover, we advanced the compiler to support arbitrary bit width operations, which are fundamental to hardware designs. We evaluate our approach by discussing the resulting implementations throughout an extensive application set and comparing them with example designs, provided by Altera. In addition, as we can derive multiple implementations for completely different target platforms from the same domain-specific language source code, we present a comparison of the achieved implementations in contrast to GPU implementations.

}, author = {Özkan, Mehmet Akif and Reiche, Oliver and Hannig, Frank and Teich, Jürgen}, booktitle = {Proceedings of the 26th International Conference on Field-Programmable Logic and Applications (FPL)}, date = {2016-08-29/2016-09-02}, doi = {10.1109/FPL.2016.7577357}, faupublication = {yes}, peerreviewed = {Yes}, title = {{FPGA}-{Based} {Accelerator} {Design} from a {Domain}-{Specific} {Language}}, venue = {Lausanne}, year = {2016} } @article{faucris.122857944, author = {Ziener, Daniel and Bauer, Florian and Becher, Andreas and Dennl, Christopher and Meyer-Wegener, Klaus and Schürfeld, Ute and Teich, Jürgen and Vogt, Jörg-Stephan and Weber, Helmut}, faupublication = {yes}, journal = {ACM Transactions on Reconfigurable Technology and Systems}, peerreviewed = {unknown}, title = {{FPGA}-{Based} {Dynamically} {Reconfigurable} {SQL} {Query} {Processing}}, year = {2015} } @article{faucris.106478284, abstract = {In this article, we propose an FPGA-based SQL query processing approach exploiting the capabilities of partial dynamic reconfiguration of modern FPGAs. After the analysis of an incoming query, a query-specific hardware processing unit is generated on the fly and loaded on the FPGA for immediate query execution. For each query, a specialized hardware accelerator pipeline is composed and configured on the FPGA from a set of presynthesized hardware modules. These partially reconfigurable hardware modules are gathered in a library covering all major SQL operations like restrictions and aggregations, as well as more complex operations such as joins and sorts. Moreover, this holistic query processing approach in hardware supports different data processing strategies including row-as column-wise data processing in order to optimize data communication and processing. This article gives an overview of the proposed query processing methodology and the corresponding library of modules. Additionally, a performance analysis is introduced that is able to estimate the processing time of a query for different processing strategies and different communication and processing architecture configurations. With the help of this performance analysis, architectural bottlenecks may be exposed and future optimized architectures, besides the two prototypes presented here, may be determined.}, author = {Ziener, Daniel and Weber, Helmut and Vogt, Jörg-Stephan and Schürfeld, Ute and Meyer-Wegener, Klaus and Teich, Jürgen and Dennl, Christopher and Becher, Andreas and Bauer, Florian}, doi = {10.1145/2845087}, faupublication = {yes}, journal = {ACM Transactions on Reconfigurable Technology and Systems}, keywords = {Dynamic partial reconfiguration; FPGA; Reconfigurable computing; SQL processing}, note = {UnivIS-Import:2017-12-18:Pub.2016.tech.IMMD.inform.fpgaba{\_}0}, pages = {25:1-25:24}, peerreviewed = {unknown}, title = {{FPGA}-{Based} {Dynamically} {Reconfigurable} {SQL} {Query} {Processing}}, volume = {9}, year = {2016} } @inproceedings{faucris.121757064, abstract = {The Controller Area Network (CAN) is one of the most popular networks for industrial distributed embedded systems, particularly automotive systems. In these systems the timely transmission of data messages is a key requirement. In this paper, a physical testbed is presented that allows the evaluation of the timing behavior of CAN message transmissions. The use of FPGAs as processing nodes allows to accurately perform measurements without influencing the original system due to the inherent parallelism of programmable hardware. As a case study, the improvement of response times by using dynamic scheduling of messages is shown with the testbed. Furthermore, the testbed helps to identify time critical simulation parameters. © 2012 IEEE.}, address = {New York, NY, USA}, author = {Ziermann, Tobias and Butiu, Alexander and Teich, Jürgen and Ziener, Daniel}, booktitle = {Proc. of the 2012 International Conference on Reconfigurable Computing (ReConFig)}, date = {2012-12-05/2012-12-07}, doi = {10.1109/ReConFig.2012.6416750}, faupublication = {yes}, isbn = {978-1-4673-2919-4}, note = {UnivIS-Import:2015-04-16:Pub.2012.tech.IMMD.inform.fpgaba}, pages = {1-6}, publisher = {IEEE Press}, title = {{FPGA}-based {Testbed} for {Timing} {Behavior} {Evaluation} of the {Controller} {Area} {Network} ({CAN})}, venue = {Cancun}, year = {2012} } @inproceedings{faucris.117711044, abstract = {In this paper we introduce a new method to watermark FPGA cores where the signature (watermark) is detected at the power supply pins of the FPGA. This is the first watermarking method, where the signature is extracted in this way. We are able to sign cores at the netlist as well as the bitfile level, so a wide spectrum of cores can be protected. The power watermarking method works with all types of FPGAs, but with Xilinx FPGAs, we can integrate the watermarking algorithms and the signature into the functionality of the watermarked core. So it is very hard to remove the watermark without destroying the core. We introduce a detection algorithm which can decode the signature from a voltage trace with high probability. Additionally, a second algorithm is introduced which improves the detection probability in case of considerable noise sources. Using this algorithm, it is possible to decode the signature even if other cores operate on the same device at the same time. © 2006 IEEE.}, author = {Ziener, Daniel and Teich, Jürgen}, booktitle = {Proceedings of IEEE International Conference on Field-Programmable Technology}, date = {2006-12-13/2006-12-15}, doi = {10.1109/FPT.2006.270313}, faupublication = {yes}, note = {UnivIS-Import:2015-04-16:Pub.2006.tech.IMMD.inform.fpgaco}, pages = {205-212}, title = {{FPGA} {Core} {Watermarking} {Based} on {Power} {Signature} {Analysis}}, venue = {Bangkok}, year = {2006} } @inproceedings{faucris.117086024, abstract = {For many applications from the areas of cryptography and coding, finite field multiplication is the most resource and time consuming operation. We have designed and optimized four high performance parallel GF(2) multipliers for an FPGA realization and analyzed the time and area complexities. One of the multipliers uses a new hybrid structure to implement the Karatsuba algorithm. For increasing performance, we make excessive use of pipelining and efficient control techniques and use a modern state-of-the-art FPGA technology. As a result we have, to our knowledge, the first hardware realization of subquadratic arithmetic and currently the fastest and most efficient implementation of 233 bit finite field multipliers.}, author = {Grabbe, Cornelia and Bednara, Marcus and Teich, Jürgen and von zur Gathen, J. and J. Shokrollahi, J.}, booktitle = {Proceedings of the IEEE International Symposium on Circuits and Systems (ISCAS-2003)}, date = {2003-05-25/2003-05-28}, faupublication = {yes}, pages = {268-271}, peerreviewed = {unknown}, title = {{FPGA} designs of parallel high performance {GF}(2^233) multipliers}, url = {https://www.scopus.com/inward/record.url?partnerID=HzOxMe3b&scp=0038790049&origin=inward}, venue = {Bangkok}, volume = {2}, year = {2003} } @inproceedings{faucris.123836944, abstract = {Invasive computing is a novel paradigm for exploitation of run-time parallelism of future MPSoC architectures through resource-aware programming and dynamic reconfiguration of the underlying architectures. Based on the state and availability of resources, an invasive algorithm organizes its computation itself. This paper presents a general methodology for mapping invasive algorithms to FPGA-based dynamically reconfigurable architectures. A detailed description of a general invasive architecture on a reconfigurable platform is given. For 1D linear processor architectures, the applicability of this concept is tested and results show substantial flexibility gains with only marginal additional hardware cost. © 2009 IEEE.}, author = {Arifin, Farhadur and Amouri, Abdulazim and Hannig, Frank and Teich, Jürgen}, booktitle = {Proceedings of the IEEE International Conference on Field Programmable Technology}, date = {2009-12-09/2009-12-11}, doi = {10.1109/FPT.2009.5377633}, faupublication = {yes}, isbn = {978-1-4244-4376-5}, note = {UnivIS-Import:2015-04-16:Pub.2009.tech.IMMD.inform.fpgaim}, pages = {135-142}, title = {{FPGA} {Implementation} of an {Invasive} {Computing} {Architecture}}, venue = {Sydney}, year = {2009} } @inproceedings{faucris.121065604, abstract = {In the last decade, there has been a dramatic growth in research and development of massively parallel many-core architectures like graphics hardware, both in academia and industry. This changed also the way programs are written in order to leverage the processing power of a multitude of cores on the same hardware. In the beginning, programmers had to use special graphics programming interfaces to express general purpose computations on graphics hardware. Today, several frameworks exist to relieve the programmer from such tasks. In this paper, we present five frameworks for parallelization on GPU Accelerators, namely RapidMind, PGI Accelerator, HMPP Workbench, OpenCL, and CUDA. To evaluate these frameworks, a real world application from medical imaging is investigated, the 2D/3D image registration. © 2011 IEEE.}, author = {Membarth, Richard and Hannig, Frank and Teich, Jürgen and Körner, Mario and Eckert, Wieland}, booktitle = {Proceedings of the 9th IEEE Symposium on Application Specific Processors (SASP)}, date = {2011-06-05/2011-06-06}, doi = {10.1109/SASP.2011.5941083}, faupublication = {yes}, isbn = {978-1-4577-1211-1}, note = {UnivIS-Import:2015-04-16:Pub.2011.tech.IMMD.inform.framew}, pages = {78-81}, peerreviewed = {Yes}, title = {{Frameworks} for {GPU} {Accelerators}: {A} {Comprehensive} {Evaluation} using {2D}/{3D} {Image} {Registration}}, venue = {San Diego, CA, USA}, year = {2011} } @inproceedings{faucris.121958584, abstract = {The development of standard processors changed in the last years moving from bigger, more complex, and faster cores to putting several more simple cores onto one chip. This changed also the way programs are written in order to leverage the processing power of multiple cores of the same processor. In the beginning, programmers had to divide and distribute the work by hand to the available cores and to manage threads in order to use more than one core. Today, several frameworks exist to relieve the programmer from such tasks. In this paper, we present five such frameworks for parallelization on shared memory multi-core architectures, namely OpenMP, Cilk++, Threading Building Blocks, RapidMind, and OpenCL. To evaluate these frameworks, a real world application from medical imaging is investigated, the 2D/3D image registration. In an empirical study, a fine-grained data parallel and a coarse-grained task parallel parallelization approach are used to evaluate and estimate different aspects like usability, performance, and overhead of each framework. © 2011 Springer-Verlag.}, address = {Heidelberg}, author = {Membarth, Richard and Hannig, Frank and Teich, Jürgen and Körner, Mario and Eckert, Wieland}, booktitle = {Proceedings of the 24th International Conference on Architecture of Computing Systems (ARCS)}, date = {2011-02-24/2011-02-25}, doi = {10.1007/978-3-642-19137-4{\_}6}, faupublication = {yes}, isbn = {9783642191367}, keywords = {2D/3D Image Registration; Cilk++; Evaluation; Frameworks; Medical Imaging; OpenCL; OpenMP; Parallelization; RapidMind; Threading Building Blocks}, note = {UnivIS-Import:2015-07-08:Pub.2011.tech.IMMD.inform.framew{\_}3}, pages = {62-73}, publisher = {Springer-verlag}, series = {Lecture Notes in Computer Science (LNCS)}, title = {{Frameworks} for {Multi}-core {Architectures}: {A} {Comprehensive} {Evaluation} using {2D}/{3D} {Image} {Registration}}, venue = {Lake Como}, volume = {6566}, year = {2011} } @inproceedings{faucris.206432314, abstract = {Optimizing data-intensive applications such as image processing for GPU targets with complex memory hierarchies requires to explore the tradeoffs among locality, parallelism, and computation. Loop fusion as one of the classical optimization techniques has been proven effective to improve locality at the function level. Algorithms in image processing are increasing their complexities and generally consist of many kernels in a pipeline. The inter-kernel communications are intensive and exhibit another opportunity for locality improvement at the system level. The scope of this paper is an optimization technique called kernel fusion for data locality improvement. We present a formal description of the problem by defining an objective function for locality optimization. By transforming the fusion problem to a graph partitioning problem, we propose a solution based on the minimum cut technique to search fusible kernels recursively. In addition, we develop an analytic model to quantitatively estimate potential locality improvement by incorporating domain-specific knowledge and architecture details. The proposed technique is implemented in an image processing DSL and source-to-source compiler called Hipacc, and evaluated over six image processing applications on three Nvidia GPUs. A geometric mean speedup of up to 2.52 can be observed in our experiments. Not only in the field of high-performance computering, field-programmable gate arrays (FPGAs) are a soaringly popular accelerator technology. However, they increase the heterogeneity of clusters, which might be equipped already today with accelerators, such as GPUs. This results in having to combine expertise from different fields, e.\,g., mathematical, algorithmic and technical experts are needed to create numerical solvers for such systems. To bridge this programmability gap, domain-specific languages are a popular choice to generate low-level implementations from an abstract algorithm description. In this work, we demonstrate the generation of implementations of numerical solvers based on the multigrid method for FPGAs from the same codebase that is also used to generate code for CPUs using a hybrid parallelization of MPI and OpenMP. Our approach yields in a hardware design that can compute up to 12 V-cycles per second with an input grid size of 4096x4096 on a mid-range FPGA, beating vectorized, single-threaded execution on an Intel i7 by a factor of almost three.

}, author = {Schmitt, Christian and Schmid, Moritz and Hannig, Frank and Teich, Jürgen and Kuckuk, Sebastian and Köstler, Harald}, booktitle = {Proceedings of the 2nd International Workshop on High-Performance Stencil Computations (HiStencils)}, date = {2015-01-20/2015-01-20}, faupublication = {yes}, month = {Jan}, pages = {9-15}, peerreviewed = {Yes}, title = {{Generation} of {Multigrid}-based {Numerical} {Solvers} for {FPGA} {Accelerators}}, url = {https://www12.cs.fau.de/downloads/schmittch/publications/SSHTKK15histencils.pdf}, venue = {Amsterdam}, year = {2015} } @inproceedings{faucris.272605803, author = {Brand, Peter and Hackenberg, Benjamin and Falk, Joachim and Teich, Jürgen}, booktitle = {International Wireless Communications and Mobile Computing Conference (IWCMC 2022)}, doi = {10.1109/iwcmc55113.2022.9824349}, faupublication = {yes}, peerreviewed = {unknown}, title = {{Grant} {Prediction}-based {Dynamic} {Power} {Management} for {5G} to {Reduce} {Mobile} {Device} {Energy} {Consumption}}, venue = {Dubrovnik}, year = {2022} } @inproceedings{faucris.320671014, abstract = {Many battery-powered IoT sensor nodes rely on harvesting energy which must be assumed an unreliable source.
Previous works have shown that a sensor node can adapt its power consumption to keep its battery’s state of charge at a sufficient level to achieve perpetual operation by, e.g., dynamically adapting its duty cycle.
In this paper, we show that it is also possible to reduce the energy consumed by the operation of a sensor node by controlling the quality of the processed and transmitted data.
Moreover, whereas most state-of-the-art methods rely on forecasting energy harvesting, risking loss of service in case of wrong predictions, this paper presents an algorithm called Guaranteed Remaining Energy Scheduling (GRES) which dynamically controls the quality of processed and transmitted data of a sensor node at runtime based on the state of charge of the battery, and providing a guarantee of safe continuous operation at the expense of data quality despite fluctuations in expected harvested energy. In experiments, GRES is evaluated and compared to an approach computing ILP-generated quality schedules for one full day ahead based on the assumption of a perfect harvested energy prediction.
It is shown that the latter approach is not only computationally and energy-wise expensive,
but also can lead to power shutdowns in case of wrongly predicted
harvesting profiles.
}, author = {Heidorn, Christian and Walter, Dominik and Candir, Yunus Emre and Hannig, Frank and Teich, Jürgen}, booktitle = {Proceedings of the 31st International Conference on Field Programmable Logic and Applications (FPL)}, date = {2021-08-30/2021-09-03}, doi = {10.1109/FPL53798.2021.00079}, faupublication = {yes}, isbn = {978-1-6654-3759-2}, keywords = {Convolutional Neural Networks, Hardware Acceleration, Coarse-Grained Reconfigurable Array}, pages = {388}, peerreviewed = {Yes}, publisher = {IEEE}, title = {{Hand} {Sign} {Recognition} via {Deep} {Learning} on {Tightly} {Coupled} {Processor} {Arrays}}, venue = {Virtual Conference}, year = {2021} } @article{faucris.209405859, abstract = {Real-time applications are increasingly targeting many-core platforms, demanding predictability in a highly dynamic environment. To enable this shift, for each application, a set of mapping candidates with diverse resource requirements and performance qualities (latency, energy, etc.) may be computed at design time, and subsequently, exploited at run time to launch the application on a mapping that adheres to the on-line quality and resource constraints. These constraints, however, may also change during execution such that the mapping in use fails to satisfy them, necessitating a switch to another mapping. This process, namely, mapping reconfiguration, involves the migration of several tasks and may harm timing predictability if the reconfiguration overhead is not accounted for. This paper presents a deterministic mapping reconfiguration methodology to enable predictable reconfigurations among a given set of mappings. To this end, first in an off-line analysis, we (a) identify low-latency migration routes with minimal allocation overhead for each pair of source/target mappings and (b) bound the worst-case reconfiguration latency using an off-line timing analysis. This information is then used at run time to perform timely reconfigurations. We further investigate a (c) hybrid timing analysis which regards the actual availability of communication resources at run time to derive tighter latency bounds. Experimental results for a variety of applications show that the proposed methodology enables reconfigurations with low allocation overhead and affordable latency. To demonstrates the practicality of the proposed methodology and the advantages of the hybrid latency analysis over its off-line counterpart, we present a case study on thermal management of many-core systems using mapping reconfiguration.the error rate of the CNN. Experimental results show that our approach can further speed up inference time by 1.24× and 1.09× for VGG-16 on the CIFAR-10 dataset and ResNet-101 on the ILSVRC-2012 dataset, respectively, compared to the state-of-the-art ABCPruner. Field Programmable Gate Arrays (FPGAs) excel at the implementation of local operators in terms of throughput per energy since the off-chip communication can be reduced with an application-specific on-chip memory configuration. Furthermore, data-level parallelism can efficiently be exploited through so-called loop coarsening, which processes multiple horizontal pixels simultaneously. Moreover, existing solutions for proper border handling in hardware show considerable resource overheads.

In this paper, we first propose novel architectures for image border handling and loop coarsening, which can significantly reduce area. Second, we present a systematic analysis of these architectures including the formulation of analytical models for their area usage. Based on these models, we provide an algorithm for suggesting the most efficient hardware architecture for a given specification. Finally, we evaluate several implementations of our proposed architectures obtained through Vivado High-Level Synthesis (HLS). The synthesis results show that the proposed coarsening architecture uses 32% less registers for a 5-by-5 convolution with a 64 coarsening factor compared to previous works, whereas the proposed border handling architectures facilitate a decrease in the Look-up Table (LUT) usage by 36%.

}, author = {Özkan, Mehmet Akif and Reiche, Oliver and Hannig, Frank and Teich, Jürgen}, booktitle = {28th Annual IEEE International Conference on Application-specific Systems, Architectures and Processors (ASAP)}, date = {2017-07-10/2017-07-12}, doi = {10.1109/ASAP.2017.7995273}, faupublication = {yes}, isbn = {978-1-5090-4825-0}, keywords = {FPGA; High Level Synthesis; Image Processing; Loop Coarsening; Border Handling}, pages = {155-163}, peerreviewed = {Yes}, title = {{Hardware} {Design} and {Analysis} of {Efficient} {Loop} {Coarsening} and {Border} {Handling} for {Image} {Processing}}, url = {https://www12.cs.fau.de/downloads/oezkan/publications/asap17.pdf}, venue = {Seattle}, year = {2017} } @inproceedings{faucris.241087248, abstract = {In this paper, we present the efficient hardware implementation of hyperbolic tangent activation function, which is most widely used in artificial neural networks for accelerating machine learning applications. The proposed design considers the floating point representation of numbers for the first time, the nonlinear nature of the activation function while sampling, and uses a lookup table for implementation. The unique way of dividing the input range into bins which follows the binary pattern reduces the hardware implementation cost. Furthermore, the input data itself is used as the address for lookup table; thus, no extra cost involved in hashing the lookup table and involves only one memory access time resulting in faster and efficient hardware implementation. Our design proves to be 3x faster when compared to similar hardware implementations using CMOS 90nm proces}, author = {Arvind, T. K. R. and Brand, Marcel and Heidorn, Christian and Boppu, Srinivas and Hannig, Frank and Teich, Jürgen}, booktitle = {Proceedings of the 24th International Symposium on VLSI Design and Test (VDAT)}, date = {2020-07-23/2020-07-25}, doi = {10.1109/VDAT50263.2020.9190305}, faupublication = {yes}, isbn = {978-1-7281-9369-4}, peerreviewed = {Yes}, publisher = {IEEE}, title = {{Hardware} {Implementation} of {Hyperbolic} {Tangent} {Activation} {Function} for {Floating} {Point} {Formats}}, venue = {Bhubaneswar}, year = {2020} } @article{faucris.115421064, author = {Teich, Jürgen}, faupublication = {no}, journal = {Bulletin SEV/VSE}, note = {UnivIS-Import:2015-03-05:Pub.1996.tech.IMMD.inform.hardwa}, pages = {17-23}, peerreviewed = {No}, title = {{Hardware}/{Software}-{Codesign}: {Massgeschneiderte} elektronische {Systeme}.{Teil} {I}: {HW}/{SW}-{Architekturen} und {Spezifikation}}, year = {1996} } @article{faucris.115471664, author = {Teich, Jürgen}, faupublication = {no}, journal = {Bulletin SEV/VSE}, note = {UnivIS-Import:2015-03-05:Pub.1997.tech.IMMD.inform.hardwa}, pages = {17-22}, peerreviewed = {No}, title = {{Hardware}/{Software}-{Codesign}: {Massgeschneiderte} elektronische {Systeme}. {Teil} {II}: {HW}/{SW}-{Synthese}}, year = {1997} } @inproceedings{faucris.118987704, author = {Ernst, Rolf and Richter, Kai and Teich, Jürgen and Thiele, Lothar and Ziegenbein, Dirk}, booktitle = {Proc. Int. Workshop on VLSI}, faupublication = {no}, note = {UnivIS-Import:2015-04-16:Pub.1999.tech.IMMD.inform.hardwa}, pages = {9-17}, title = {{Hardware}/{Software} {Codesign} of {Embedded} {Systems} - {The} {SPI} {Workbench}}, venue = {Orlando, Florida}, year = {1999} } @article{faucris.117546924, author = {Teich, Jürgen}, doi = {10.1109/JPROC.2011.2182009}, faupublication = {yes}, journal = {Proceedings of the IEEE}, note = {UnivIS-Import:2015-03-09:Pub.2012.tech.IMMD.inform.hardwa}, pages = {1411-1430}, peerreviewed = {Yes}, title = {{Hardware}/{Software} {Co}-{Design}: {Past}, {Present}, and {Predicting} the {Future}}, volume = {100}, year = {2012} } @inproceedings{faucris.118706104, abstract = {Managing future many-core architectures with hundreds of cores, running multiple applications in parallel, is very challenging. One of the major reasons is the communication overhead required to handle such a large system. Distributed management is proposed to reduce this overhead. The architecture is divided into regions which are managed separately. The instance managing the region and the applications running within the regions need to collect data for various reasons from time to time, e.g., to collect data for proper mapping decision, to synchronize tasks or to aggregate computation results. In this work, we propose and investigate different strategies for adaptive data collection in meshed Networks on Chip. The mechanisms can be used to collect data within regions, which are defined during run-time in respect of size and position. The mechanisms are investigated while considering delay, NoC utilization and implementation costs. The results show that the selection of the used mechanism depends on the requirements. Synthesis results compare area overhead, timing impact and energy consumption. © 2013 IEEE.}, address = {Red Hook, NY, USA}, author = {Heisswolf, Jan and Weichslgartner, Andreas and Zaib, Aurang and König, Ralf and Wild, Thomas and Herkersdorf, Andreas and Teich, Jürgen and Becker, Jürgen}, booktitle = {Proc. IEEE 27th International Parallel and Distributed Processing Symposium Workshops PhD Forum}, date = {2013-05-20/2013-05-24}, doi = {10.1109/IPDPSW.2013.124}, faupublication = {yes}, keywords = {1000 cores; Adaptive data collection; aggregation; Hamilton cycle; Networks on Chip; region-based management}, note = {UnivIS-Import:2015-04-16:Pub.2013.tech.IMMD.inform.hardwa}, pages = {153-162}, publisher = {Curran Associate}, title = {{Hardware} {Supported} {Adaptive} {Data} {Collection} for {Networks} on {Chip}}, venue = {Boston, Massachusetts}, year = {2013} } @inproceedings{faucris.117993084, author = {Bednara, Marcus and Beyer, O. and Teich, Jürgen and Wanka, Rolf}, booktitle = {Workshop on System Design Automation - SDA 2000}, faupublication = {no}, note = {UnivIS-Import:2015-04-16:Pub.2000.tech.IMMD.inform.hardwa}, pages = {37-44}, title = {{Hardware}-{Supported} {Sorting}: {Design} and {Tradeoff} {Analysis}}, venue = {Rathen}, year = {2000} } @incollection{faucris.118939304, author = {Bednara, Marcus and Beyer, O. and Teich, Jürgen and Wanka, Rolf}, booktitle = {In System Design Automation}, faupublication = {no}, note = {UnivIS-Import:2015-04-20:Pub.2001.tech.IMMD.inform.hardwa}, pages = {97-107}, peerreviewed = {unknown}, publisher = {Kluwer Academic Publishers}, title = {{Hardware} {Supported} {Sorting}: {Design} and {Tradeoff} {Analysis}}, year = {2001} } @inproceedings{faucris.118197904, abstract = {When using dynamically and partially reconfigurable FPGAs in embedded systems, the scheduler needs to fulfill area and time requirements for each task. While those demands are already well studied in literature, another characteristic peculiarity of reconfigurable systems has been rather neglected: the reconfiguration overhead. However, scheduling algorithms considering the exclusive access to the reconfiguration port can improve the latency of obtained schedules considerably. In this paper, we present new scheduling heuristics and a methodology to compare approaches which take into consideration the reconfiguration overheads with those which disregard them. Furthermore, our experimental results give insight into possible performance increases and present problem instances for which the reconfiguration latency is negligible. ©2008 IEEE.}, address = {New York}, author = {Angermeier, Josef and Teich, Jürgen}, booktitle = {Proceedings 15th Reconfigurable Architectures Workshop}, date = {2008-04-14/2008-04-18}, doi = {10.1109/IPDPS.2008.4536540}, faupublication = {yes}, note = {UnivIS-Import:2015-04-16:Pub.2008.tech.IMMD.inform.heuris}, pages = {1-8}, publisher = {IEEE Press}, title = {{Heuristics} for {Scheduling} {Reconfigurable} {Devices} with {Consideration} of {Reconfiguration} {Overheads}}, venue = {Miami, Florida}, year = {2008} } @incollection{faucris.123176504, address = {Boston, Dordrecht, London}, author = {Haubelt, Christian and Mostaghim, Sanaz and Slomka, Frank and Teich, Jürgen and Tyagi, Ambrish}, booktitle = {Evolutionary Algorithms in System Design}, editor = {Drechsler, R. and Drechsler, N}, faupublication = {no}, pages = {63-104}, peerreviewed = {unknown}, publisher = {Kluwer Academic Publishers}, series = {Genetic Algorithms and Evolutionary Computation (GENA)}, title = {{Hierachical} {Synthesis} of {Embedded} {Systems} {Using} {Evolutionary} {Algorithms}}, year = {2003} } @inproceedings{faucris.117945124, author = {Arzt, Ulrich and Teich, Jürgen and Schumacher, M and Thiele, Lothar}, booktitle = {Proc. CompEuro 1992}, date = {1992-05-04/1992-05-08}, doi = {10.1109/CMPEUR.1992.218504}, faupublication = {no}, note = {UnivIS-Import:2015-04-16:Pub.1992.tech.IMMD.inform.hierar}, pages = {232-237}, title = {{Hierarchical} concepts in the design of processor arrays}, venue = {The Hague, The Netherlands,}, year = {1992} } @inproceedings{faucris.121657624, author = {Anlauff, Matthias and Fischer, Dirk and Kutter, Philipp and Teich, Jürgen and Weper, Ralph}, booktitle = {Proc. EUROCAST 2001}, date = {2001-02-19/2001-02-23}, faupublication = {no}, note = {UnivIS-Import:2015-04-16:Pub.2001.tech.IMMD.inform.hierar{\_}6}, pages = {271-274}, title = {{Hierarchical} {Microprocessor} {Design} {Using} {XASM}}, venue = {Las Palmas de Gran Canaria}, year = {2001} } @inproceedings{faucris.115861064, author = {Fischer, Dirk and Teich, Jürgen and Weper, Ralph}, booktitle = {International Workshop on Software and Compilers for Embedded Systems}, date = {2001-03-20/2001-03-22}, faupublication = {no}, note = {UnivIS-Import:2015-04-16:Pub.2001.tech.IMMD.inform.hierar}, pages = {???}, title = {{Hierarchical} {Modeling} and {Simulation} of {Embedded} {Processors} {Using} {ASMs}}, venue = {St.Goar}, year = {2001} } @inproceedings{faucris.117717204, abstract = {Processor arrays are used as accelerators for plenty of data flow-dominant applications. The explosive growth in research and development of massively parallel processor array architectures has lead to demand for mapping tools to realize the full potential of these architectures. Such architectures are characterized by hierarchies of parallelism and memory structures, i.e. processor array apart from different levels of cache arrays have a number of processing elements (PE) where each PE can further contain sub-word parallelism. In order to handle large scale problems, balance local memory requirements with I/O-bandwidth, and use different hierarchies of parallelism and memory, one needs a sophisticated transformation called hierarchical partitioning. In this paper, we introduce for the first time a detailed methodology encompassing hierarchical partitioning. © 2006 IEEE.}, author = {Dutta, Hritam and Hannig, Frank and Teich, Jürgen}, booktitle = {Proceedings of the 5th International Conference on Parallel Computing in Electrical Engineering}, date = {2006-09-13/2006-09-17}, doi = {10.1109/PARELEC.2006.43}, faupublication = {yes}, isbn = {978-0-7695-2554-9}, note = {UnivIS-Import:2015-04-16:Pub.2006.tech.IMMD.inform.hierar}, pages = {153-160}, title = {{Hierarchical} {Partitioning} for {Piecewise} {Linear} {Algorithms}}, venue = {Bialystok}, year = {2006} } @article{faucris.122609784, abstract = {We present a self-adaptive hierarchical power management technique for massively parallel processor architectures, supporting a new resource-aware parallel computing paradigm called invasive computing. Here, an application can dynamically claim, execute, and release the resources in three phases: resource acquisition (invade), program loading/configuration and execution (infect), and release (retreat). Resource invasion is governed by dedicated decentralized hardware controllers, called invasion controllers (iCtrls), which are integrated into each processing element (PE). Several invasion strategies for claiming linearly connected or rectangular regions of processing resources are implemented. The key idea is to exploit the decentralized resource management inherent to invasive computing for power savings by enabling applications themselves to control the power for processing resources and invasion controllers using a hierarchical power-gating approach. We propose analytical models for estimating various components of energy consumption for faster design space exploration and compare them with the results obtained from a cycle-accurate C++ simulator of the processor array. In order to find optimal design trade-offs, various parameters like (a) energy consumption, (b) hardware cost, and (c) timing overheads are compared for different sizes of power domains. Experimental results show significant energy savings (up to 73%) for selected characteristical algorithms and different resource utilizations. In addition, we demonstrate the accuracy of our proposed analytical model. Here, estimation errors less than 3.6% can be reported. © 2012 ACM.}, author = {Lari, Vahid and Muddasani, Shravan and Boppu, Srinivas and Hannig, Frank and Schmid, Moritz and Teich, Jürgen}, doi = {10.1145/2390191.2390193}, faupublication = {yes}, journal = {ACM Transactions on Design Automation of Electronic Systems}, keywords = {Adaptive power optimization; Parallel computing; Resource awareness; Runtime resource management; Timing overhead minimization}, note = {UnivIS-Import:2015-03-09:Pub.2012.tech.IMMD.inform.hierar}, pages = {1-25}, peerreviewed = {Yes}, title = {{Hierarchical} {Power} {Management} for {Adaptive} {Tightly}-{Coupled} {Processor} {Arrays}}, volume = {18}, year = {2012} } @article{faucris.122845184, author = {Xu, Yang and Teich, Jürgen}, faupublication = {yes}, journal = {ACM Transactions on Design Automation of Electronic Systems}, peerreviewed = {Yes}, title = {{Hierarchical} {Statistical} {Leakage} {Analysis} and its {Application}}, year = {2016} } @inproceedings{faucris.118007824, author = {Fekete, Sandor P. and Köhler, Ekkehard and Teich, Jürgen}, booktitle = {Proc. 7th Workshop on Algorithms and Data Structures, Lecture Notes in Computer Science (LNCS), Vol. 2125}, faupublication = {no}, note = {UnivIS-Import:2015-04-16:Pub.2001.tech.IMMD.inform.higher}, pages = {300-312}, title = {{Higher}-{Dimensional} {Packing} with {Order} {Constraints}}, year = {2001} } @article{faucris.116611484, author = {Teich, Jürgen and Fekete, Sandor P. and Köhler, Ekkehard}, faupublication = {yes}, journal = {SIAM Journal on Discrete Mathematics}, note = {UnivIS-Import:2015-03-09:Pub.2006.tech.IMMD.inform.higher}, pages = {1056-1078}, peerreviewed = {Yes}, title = {{Higher}-dimensional packing with order constraints}, year = {2006} } @inproceedings{faucris.110950224, abstract = {In this paper, we present a holistic design methodology for em-
bedded smart camera networks consisting of FPGA-based Pro-
grammable System-on-Chips (PSoCs). Beginning with a high-level
model of a distributed smart camera application specified in Mat-
lab/Simulink, a behavioral data-flow-based SystemC model is auto-
matically generated. In turn, every vertex of the data-flow graph rep-
resents a functional task of the application and can be implemented
either as a software or a hardware component on PSoC nodes within
the smart camera network. Also, such hardware/software designs
for data-intensive camera applications require a sophisticated hard-
ware/software interconnect. These design decisions on mapping
tasks and communication influence throughput but also the costs
of the distributed smart camera network.
Our methodology varies communication routing and task map-
ping within a camera network to optimize several objectives si-
multaneously. Particularly, we use Design Space Exploration (DSE)
to explore the trade-off between resource costs and throughput
in a smart camera application realized as a heterogeneous hard-
ware/software system. In this context, High-Level Synthesis (HLS)
constitutes an important mechanism to estimate the functional
properties of different hardware design candidates automatically.
Furthermore, it enables a semi-automatic synthesis of the smart
camera implementation candidate. As a case study, we apply our
methodology to a high-bandwidth multi-camera system consist-
ing of PSoC nodes, which apply feature-based image registration
techniques to combine multiple images with an overlapping field-
of-view to produce a panoramic vie}, author = {Streit, Franz-Josef and Letras, Martin and Schid, Matthias and Falk, Joachim and Wildermann, Stefan and Teich, Jürgen}, booktitle = {ACM Proceedings of the 11th International Conference on Distributed Smart Cameras}, date = {2017-09-05/2017-09-07}, doi = {10.1145/3131885.3131932}, faupublication = {yes}, keywords = {High-Level Synthesis; Camera Network; Design Space Exploration}, peerreviewed = {unknown}, publisher = {Association for Computing Machinery}, title = {{High}-{Level} {Synthesis} for {Hardware}/{Software} {Co}-{Design} of {Distributed} {Smart} {Camera} {Systems}}, venue = {Stanford, USA}, year = {2017} } @incollection{faucris.106130244, abstract = {The continuous progress in semiconductor technology allows for more and more complex processor architectures. The downside of these technological advances is that computing has already hit a power wall and clock frequencies can barely be increased. In order to scale computing performance in the future, systems' energy efficiency and the degree of parallelism have to be significantly improved. The design of heterogeneous hardware with different specialized resources seems to be a promising solution. When highest performance (throughput, short latencies) and energy efficiency are important, as a remedy, we consider the generation of dedicated FPGA accelerators to address these stringent requirements. In this work, we present the PARO high-level synthesis framework for the automated generation of massively parallel FPGA accelerators. The framework is tailored for compute-intensive applications from the domains of image, video, and other digital signal processing, as well as algorithms from linear algebra. Unique features of PARO include: (1) The design entry in form of a compact and intuitive domain-specific language that is closely related to a mathematical problem description, (2) support for integer, fixed point, floating point, and custom arithmetic, (3) advanced loop transformations (e.g., partitioning) and scheduling techniques in the polyhedron model, (4)generation of accelerator IP cores (VHDL code) that can be easily integrated into a system design such as an SoC or in a networked scenario. Finally, we showcase the capabilities of our framework for the development of a range image conditioning pipeline for smart cameras for range sensing. © 2014 The authors and IOS Press.}, address = {Amsterdam, The Netherlands}, author = {Schmid, Moritz and Hannig, Frank and Tanase, Alexandru-Petru and Teich, Jürgen}, booktitle = {Parallel Computing: Accelerating Computational Science and Engineering (CSE)}, doi = {10.3233/978-1-61499-381-0-497}, faupublication = {yes}, isbn = {978-1-61499-380-3}, keywords = {Accelerators; Domain-specific language; FPGAs; High-level synthesis; Polyhedron model}, note = {UnivIS-Import:2015-04-20:Pub.2014.tech.IMMD.inform.highle}, pages = {497-506}, peerreviewed = {unknown}, publisher = {IOS Press}, series = {Advances in Parallel Computing}, title = {{High}-{Level} {Synthesis} {Revised} - {Generation} of {FPGA} {Accelerators} from a {Domain}-{Specific} {Language} using the {Polyhedron} {Model}}, volume = {25}, year = {2014} } @inproceedings{faucris.121047784, abstract = {Current multi- and many-core computer architectures heavily use NoC communication in order to meet the increased bandwidth demands between the processors and for reasons of scalability. For the proper analysis of concurrency, utilization, and workload distribution of parallel multi-media applications running on such NoC-based architectures, high-speed simulation techniques are required. Apart from accurate timing simulation of compute resources, it is of utmost importance also to accurately model the delays caused by the packet-based network communication in order to reliably verify performance numbers, or to identify any bottlenecks of the underlying architecture, or to study workload distribution techniques or routing algorithms. In this paper, we present a novel simulation approach for NoCs that allows to simulate such communication delays equally accurate but much faster in average than on a flit-by-flit basis. We propose novel algorithmic and analytical techniques that predict the transmission intervals dynamically based on the arrival of communication requests, actual congestion in the NoC, routing information, packet lengths, and other parameters. According to such predictions, the simulation time may in many cases be automatically advanced, thus reducing the number of events to process in the simulator to a large extent. The presented NoC simulation technique has been integrated into a system-level multi-core architecture simulator. Experiments in running parallel real-world and multi-media applications on a simulated scalable NoC architecture show that we are able to achieve speedups of three orders of magnitude compared to cycle-accurate NoC simulators, while preserving a timing accuracy of above 95}, author = {Roloff, Sascha and Hannig, Frank and Teich, Jürgen}, booktitle = {Proceedings of the 15th IEEE/ACM Symposium on Embedded Systems for Real-Time Multimedia (ESTIMedia)}, date = {2017-10-15/2017-10-20}, doi = {10.1145/3139315.3139320}, editor = {ACM}, faupublication = {yes}, isbn = {978-1-4503-5117-1}, keywords = {Network-on-Chip, Modeling, Simulation, Parallel Programming}, pages = {2-11}, peerreviewed = {Yes}, title = {{High} {Performance} {Network}-on-{Chip} {Simulation} by {Interval}-based {Timing} {Predictions}}, venue = {Seoul, Republic of Korea}, year = {2017} } @inproceedings{faucris.241085230, author = {Alhaddad, Samer and Förstner, Jens and Groth, Stefan and Grünewald, Daniel and Grynko, Yevgen and Hannig, Frank and Kenter, Tobias and Pfreundt, Franz-Josef and Plessl, Christian and Schotte, Merlind and Steinke, Thomas and Teich, Jürgen and Weiser, Martin and Wende, Florian}, booktitle = {Proceedings of the 18th International Workshop on Algorithms, Models and Tools for Parallel Computing on Heterogeneous Platforms (HeteroPar) in Euro-Par 2020: Parallel Processing Workshops}, date = {2020-08-24/2020-08-24}, doi = {10.1007/978-3-030-71593-9{\_}15}, faupublication = {yes}, isbn = {978-3-030-71593-9}, peerreviewed = {Yes}, publisher = {Springer}, title = {{HighPerMeshes} -- {A} {Domain}-{Specific} {Language} for {Numerical} {Algorithms} on {Unstructured} {Grids}}, venue = {Warsaw}, year = {2021} } @inproceedings{faucris.121616044, abstract = {In this paper we present a new approach for generating high-speed optimized event-driven register transfer level (RTL) compiled simulators. The generation of the simulators is part of our BUILDABONG [7] framework, which aims at architecture and compiler co-generation for special purpose processors. The main focus of the paper is on the transformation of a given architecture's circuit into a graph and applying on it an essential graph decomposition algorithm to transform the graph into subgraphs denoting the minimal subsets of sequential elements which have to be reevaluated during each simulation cycle. As a second optimization, we present a partitioning algorithm, which introduces intermediate registers to minimize the number of evaluations of combinational nodes during a simulation cycle. The simulator's superior performance compared to an existing commercial simulator is shown. Finally, we demonstrate the pertinence of our approach by simulating a MIPS processor. © Springer-Verlag Berlin Heidelberg 2004.}, author = {Kupriyanov, Olexiy and Hannig, Frank and Teich, Jürgen}, booktitle = {Proceedings of the International Workshop on Systems, Architectures, Modeling and Simulation (SAMOS'04)}, date = {2004-07-19/2004-07-21}, faupublication = {yes}, pages = {519-529}, peerreviewed = {unknown}, title = {{High}-speed event-driven {RTL} compiled simulation}, url = {https://www.scopus.com/inward/record.url?partnerID=HzOxMe3b&scp=35048851603&origin=inward}, venue = {Samos}, volume = {3133}, year = {2004} } @article{faucris.117721604, abstract = {Domain-Specific Languages (DSLs) provide high-level and domain-specific abstractions that allow expressive and concise algorithm descriptions. Since the description in a DSL hides also the properties of the target hardware, DSLs are a promising path to target different parallel and heterogeneous hardware from the same algorithm description. In theory, the DSL description can capture all characteristics of the algorithm that are required to generate highly efficient parallel implementations. However, most frameworks do not make use of this knowledge and the performance cannot reach that of optimized library implementations. In this article, we present the HIPAcc framework, a DSL and source-to-source compiler for image processing. We show that domain knowledge can be captured in the language and that this knowledge enables us to generate tailored implementations for a given target architecture. Back ends for CUDA, OpenCL, and Renderscript allow us to target discrete Graphics Processing Units (GPUs) as well as mobile, embedded GPUs. Exploiting the captured domain knowledge, we can generate specialized algorithm variants that reach the maximal achievable performance due to the peak memory bandwidth. These implementations outperform state-of-the-art domain-specific languages and libraries significantl}, author = {Membarth, Richard and Reiche, Oliver and Hannig, Frank and Teich, Jürgen and Körner, Mario and Eckert, Wieland}, doi = {10.1109/TPDS.2015.2394802}, faupublication = {yes}, journal = {IEEE Transactions on Parallel and Distributed Systems}, keywords = {Domain-specific language; Image processing; Code generation; Source-to-source translation; GPU; CUDA; OpenCL; Renderscript}, month = {Jan}, note = {UnivIS-Import:2015-03-09:Pub.2015.tech.IMMD.inform.hipacc}, pages = {210-224}, peerreviewed = {Yes}, title = {{HIPAcc}: {A} {Domain}-{Specific} {Language} and {Compiler} for {Image} {Processing}}, volume = {27}, year = {2016} } @incollection{faucris.109597224, author = {Schmid, Moritz and Reiche, Oliver and Hannig, Frank and Teich, Jürgen}, booktitle = {FPGAs for Software Programmers}, doi = {10.1007/978-3-319-26408-0{\_}12}, editor = {Dirk Koch, Frank Hannig, and Daniel Ziener}, faupublication = {yes}, peerreviewed = {unknown}, publisher = {Springer}, title = {{HIPAcc}}, year = {2016} } @article{faucris.241895233, abstract = {Writing programs for heterogeneous platforms optimized for high performance is hard since this requires the code to be tuned at a low level with architecture-specific optimizations that are most times based on fundamentally differing programming paradigms and languages. OpenVX promises to solve this issue for computer vision applications with a royalty-free industry standard that is based on a graph-execution model. Yet, the OpenVX' algorithm space is constrained to a small set of vision functions. This hinders accelerating computations that are not included in the standard. In this paper, we analyze OpenVX vision functions to find an orthogonal set of computational abstractions. Based on these abstractions, we couple an existing domain-specific language (DSL) back end to the OpenVX environment and provide language constructs to the programmer for the definition of user-defined nodes. In this way, we enable optimizations that are not possible to detect with OpenVX graph implementations using the standard computer vision functions. These optimizations can double the throughput on an Nvidia GTX GPU and decrease the resource usage of a Xilinx Zynq FPGA by 50% for our benchmarks. Finally, we show that our proposed compiler framework, called HipaccVX, can achieve better results than the state-of-the-art approaches Nvidia VisionWorks and Halide-HLS.execution like latency or power consumption. Enforcement of such requirements can be implemented
by a reactive control loop, where an enforcer determines based on a system response (feedback) how
to control the system, e.g., by selecting the number of active cores allocated to a program or by
scaling their voltage/frequency mode. It is of a particular interest to design enforcement strategies
for which it is possible to provide formal guarantees with respect to requirement violations, especially
under a largely varying environmental input (workload) per execution. In this paper, we consider
enforcement strategies that are modeled by a finite state machine (FSM) and the environment by
a discrete-time Markov chain. Such a formalization enables the formal verification of temporal
properties (verification goals) regarding the satisfaction of requirements of a given enforcement
strategy.
In this paper, we propose history-based enforcement FSMs which compute a reaction not just on
the current, but on a fixed history of K previously observed system responses. We then analyze the
quality of such enforcement FSMs in terms of the probability of satisfying a given set of verification
goals and compare them to enforcement FSMs that react solely on the current system response.
As experimental results, we present three use cases while considering requirements on latency and
power consumption. The results show that history-based enforcement FSMs outperform enforcement
FSMs that only consider the current system response regarding the probability of satisfying a given
set of verification goals.However, the static nature of such optimized accelerators limits the amount of operators they can implement to a fixed amount configured at synthesis time.
If not all operators can be pushed down to the accelerator, the decision which operators should be pushed down has therefore a big impact on the resulting data size and overall query execution time.
Statistics are therefore used to determine which operators to push onto the hardware accelerators.
Gathering these statistics is therefore of utmost importance.
In this paper we present multiple possibilities to gather statistics within an accelerator while executing a partial query.
These statistics can be gradually improved with every execution of the accelerator be of use in future queries during the query planning phase.During execution, new I/O data is continuously fetched forth and back to memory.
This data exchange is very often performance-critical and a careful orchestration thus vital.
To satisfy the I/O demand for accelerators of loop nests, it was shown that
the individual reads and writes can be merged into larger blocks, which are subsequently transferred by a single DMA transfer.
Furthermore, the order in which such DMA transfers must be issued, was shown to be reducible to a real-time task scheduling problem to be solved at run time.
Rather than just concepts, we investigate in this paper the architecture and implementation of such a Loop I/O Controller (LION) for a class of CGRAs called TCPAs.
Based on a novel heap-based priority queue approach, the proposed controller is able to issue every 6 cycles a new DMA request to a memory bus.
Even on a simple FPGA prototype running at just 200 MHz, this allows for more than 33 million DMA requests to be issued per second.
Since the execution time of a typical DMA request is in general at least one order of magnitude longer, we can conclude that this rate is sufficient to fully utilize a memory bus.
In the evaluation of our architecture, we compare the proposed priority queue with various state-of-the-art approaches and conclude that our priority queue is not only cheaper, but also guaranteeing real-time constraints.
Furthermore, we discuss many different optimizations for the Loop I/O Controller, which can reduce the area of the proposed architecture significantly.
in hardware/software co-design. Therefore, to keep up with this trend, an automated systematic design flow that directly incorporates model simulation with High-Level Synthesis (HLS) for hybrid hardware and software implementations is necessary. In order to address this issue, the work at hand makes use of the modeling and simulation environment MATLAB/Simulink, a de facto standard in model-based development. Additionally, we present a novel design flow, which enables the assignment of functional tasks of a Simulink application to individual hardware/software solutions for Xilinx Zynq PSoCs. Thereby, the proposed method-
ology enables control and system engineers to automatically explore different hardware and software implementation variants from a behavioral model. As a case study, we present a JPEG decoder application and investigate design objectives like resource costs and throughput to show the practicability of our approach. Future Advanced Driver Assistance Systems (ADAS) require the continuous computation of detailed maps of the vehicle’s environment. Due to the high demand of accuracy and the enormous amount of data to be fused and processed, common architectures used today, like single-core processors in automotive Electronic Control Units (ECUs), do not provide enough computing power. Here, emerging embedded multi-core architectures are appealing such as embedded Graphics Processing Units (GPUs). In this paper, we (a) identify and analyze common subalgorithms of ADAS algorithms for computing environment maps, such as interval maps, for suitability to be parallelized and run on embedded GPUs. From this analysis, (b) performance models are derived on achievable speedups with respect to sequential single-core CPU implementations. (c) As a third contribution of this paper, these performance models are validated by presenting and comparing a novel parallelized interval map GPU implementation against a parallel occupancy grid map implementation. For both types of environment maps, implementations on an Nvidia Tegra K1 prototype are compared to verify the correctness of the introduced performance models. Finally, the achievable speedups with respect to a single-core CPU solution are reported. These range from 3x to 275x for interval and grid map computations.}, author = {Fickenscher, Jörg and Reiche, Oliver and Schlumberger, Jens and Hannig, Frank and Teich, Jürgen}, booktitle = {Proceedings of the 18th IEEE International High-Level Design Validation and Test Workshop (HLDVT)}, date = {2016-10-07/2016-10-08}, doi = {10.1109/HLDVT.2016.7748257}, faupublication = {yes}, pages = {70-77}, peerreviewed = {Yes}, title = {{Modeling}, {Programming} and {Performance} {Analysis} of {Automotive} {Environment} {Map} {Representations} on {Embedded} {GPUs}}, venue = {Santa Cruz, CA}, year = {2016} } @inproceedings{faucris.122395284, author = {Haubelt, Christian and Teich, Jürgen and Richter, Kai and Ernst, Rolf}, booktitle = {GI / ITG / GMM Workshop - Methoden und Beschreibungssprachen zur Modellierung und Verifikation von Schaltungen und Systemen}, faupublication = {no}, pages = {163-171}, peerreviewed = {unknown}, title = {{Modellierung} {Rekonfigurierbarer} {Systemarchitekturen}}, venue = {Tuebingen}, year = {2002} } @inproceedings{faucris.108434524, author = {Ahmadinia, Ali and Bobda, Christophe and Ding, Ji and Majer, Mateusz and Teich, Jürgen}, booktitle = {IFIP VLSI-SOC 2005}, faupublication = {yes}, note = {UnivIS-Import:2015-04-16:Pub.2005.tech.IMMD.inform.modula}, pages = {103-108}, title = {{Modular} {Video} {Streaming} on a {Reconfigurable} {Platform}}, venue = {Perth}, year = {2005} } @inproceedings{faucris.122257784, author = {Witterauf, Michael and Tanase, Alexandru-Petru and Hannig, Frank and Teich, Jürgen}, booktitle = {Proceedings of the 27th IEEE International Conference on Application-specific Systems, Architectures and Processors (ASAP)}, date = {2016-07-06/2016-07-08}, faupublication = {yes}, peerreviewed = {unknown}, title = {{Modulo} {Scheduling} of {Symbolically} {Tiled} {Loops} for {Tightly} {Coupled} {Processor} {Arrays}}, venue = {London}, year = {2016} } @inproceedings{faucris.121574024, author = {Frauenheim, Th. and Hoffmann, M. and König, P.H. and Mostaghim, Sanaz and Teich, Jürgen}, booktitle = {Proceedings of the Congress on Evolutionary Computation (CEC '04)}, date = {2004-06-20/2004-06-23}, faupublication = {yes}, pages = {212-219}, peerreviewed = {unknown}, title = {{Molecular} {Force} {Field} {Parameterization} using {Multi}-{Objective} {Evolutionary} {Algorithms}}, venue = {Portland}, year = {2004} } @inproceedings{faucris.281304369, abstract = {Deep neural networks (DNNs) are computationally intensive, making them difficult to deploy on resource-constrained embedded systems. Model compression is a set of techniques that removes redundancies from a neural network with affordable degradation in task performance. Most compression methods do not target hardware-based objectives such as latency directly; however, few methods approximate latency with floating-point operations (FLOPs) or multiply-accumulate operations (MACs). Using these indirect metrics cannot directly translate to the relevant performance metric on the hardware, i.e., latency and throughput. To address this limitation, we introduce Multi-Objective Sensitivity Pruning, “MOSP,” a three-stage pipeline for filter pruning: hardware-aware sensitivity analysis, Criteria-optimal configuration selection, and pruning based on explainable AI (XAI). Our pipeline is compatible with a single or combination of target objectives such as latency, energy consumption, and accuracy. Our method first formulates the sensitivity of layers of a model against the target objectives as a classical machine learning problem. Next, we choose a Criteria-optimal configuration controlled by hyperparameters specific to each objective of choice. Finally, we apply XAI-based filter ranking to select filters to be pruned. The pipeline follows an iterative pruning methodology to recover any loss in degradation in task performance (e.g., accuracy). We allow the user to prefer one objective function over the other. Our method outperforms the selected baseline method across different neural networks and datasets in both accuracy and latency reductions and is competitive with state-of-the-art approaches.hybrid-adaptive DSE where a max-plus algebra-based analytic throughput calculation method is used in the initial DSE phase to enable a fast progress of the search space exploration. However, as this analysis may be inaccurate as neglecting some real-world effects like cache and scheduling overhead, throughput measurements are taken later in the DSE. Moreover, we explore the trade-off between scheduling efficiency of implementation candidates—in favor of reducing concurrency—and exploiting concurrency to a large extent for parallel execution of the application. To find solutions of highest achievable throughput, it is shown that not only highly scheduling efficient implementation candidates but also highly parallel implementation candidates are essential when determining the initial population. In this realm, we contribute a method for diversity-based population initialization. For a representative set of benchmarks, it is shown that the combination of the two major contributions allows us to find much higher throughput multi-core solutions within a given exploration time compared to a state-of-the-art DSE approach.Additionally, DSE can be used to explore the distribution of tasks to resources and their scheduling to obtain optimized trade-off solutions between throughput and resource costs.
However, the performance evaluation of an implementation candidate in particular via compilation and throughput measurement on the target hardware is prohibitively time-consuming.
Thus, we propose to use a max-plus algebra-based analytic throughput calculation method in the initial DSE phase where a fast evaluation with low accuracy is sufficient to guide the search through the design space.
However, this analysis neglects some real-world concerns like cache effects and scheduling overhead.
Thus, a hybrid DSE is proposed where throughput measurements are taken later in the DSE to get more accurate throughput results for real-world platforms.
Results show that our approach is able to find much higher throughput multi-core solutions within a given exploration time compared to a state-of-the-art DSE approach.of data becomes available from mobile devices tested out in the field. Exploiting this data, recent
investigations show the potential to improve the wireless modem design and performance using data-
centric approaches, e.g., power optimization with machine learning (ML) algorithms [1]. As depicted in
Figure 1, such data-centric workflows are exploratory and iterative by nature. For instance, time pattern
identification is performed by domain experts to derive assumptions on potential optimizations and these
assumptions are continuously refined during multiple iterations of data collection and exploration. In this
context, we propose 3 ideas to increase the exploration speed: (i) a methodology to minimize the data
pre-processing duration in each iteration, (ii) a novel entropy-based data interaction technique for visual
event sequence exploration and (iii) a similarity measure to perform subsequence matching in order to
identify frequent modem behaviors to be optimized. In this paper, we a) provide a netlist-based circuit analysis technique to distinguish so-called \emph{critical} configuration bits from \emph{essential} bits in order to identify configuration bits which will need also state-restoring actions after a recovered SEU and which not. Furthermore, b) an alternative classification approach using fault injection is developed in order to compare both classification techniques. Moreover, c) we will propose a floorplanning approach for reducing the effective number of scrubbed frames and d), experimental results will give evidence that our optimization methodology not only allows to detect errors earlier but also to minimize the Mean-Time-To-Repair (MTTR) of a circuit considerably.
In particular, we show that by using our approach, the MTTR for datapath-intensive circuits can be reduced by up to 48.5 \% in comparison to standard approaches.}, author = {Schmidt, Bernhard and Ziener, Daniel and Teich, Jürgen and Zöllner, Christian}, faupublication = {yes}, journal = {Integration-The Vlsi Journal}, keywords = {Single Event Upsets; FPGA Scrubbing; Configuration Bit Partitioning; Floorplanning; Fault Injection}, peerreviewed = {Yes}, title = {{Optimizing} {Scrubbing} by {Netlist} {Analysis} for {FPGA} {Configuration} {Bit} {Classification} and {Floorplanning}}, year = {2017} } @inproceedings{faucris.120375464, author = {Bhattacharyya, Shuvra S. and Teich, Jürgen and Zitzler, Eckart}, booktitle = {Proc. of CEC'2000, the Int. Conf. on Evolutionary Computation}, faupublication = {no}, note = {UnivIS-Import:2015-04-16:Pub.2000.tech.IMMD.inform.optimi}, pages = {365-372}, title = {{Optimizing} the {Efficiency} of {Parameterized} {Local} {Search} within {Global} {Search}}, venue = {La Jolla, CA}, year = {2000} } @incollection{faucris.119080764, address = {Basel}, author = {Ziermann, Tobias and Wildermann, Stefan and Teich, Jürgen}, booktitle = {Organic Computing - A Paradigm Shift for Complex Systems}, doi = {10.1007/978-3-0348-0130-0{\_}32}, faupublication = {yes}, isbn = {978-3-0348-0130-0}, note = {UnivIS-Import:2015-04-20:Pub.2011.tech.IMMD.inform.organi}, pages = {489-501}, peerreviewed = {unknown}, publisher = {Birkhäuser Verlag}, title = {{OrganicBus}: {Organic} {Self}-organising {Bus}-{Based} {Communication} {Systems}}, year = {2011} } @inproceedings{faucris.124029224, abstract = {We propose a new processor architecture called
Orthogonal Instruction Processing (OIP). Contrary to Very Long
Instruction Word (VLIW) decoding, we propose to orthogonally
decode the sub-instruction words of each Functional Unit (FU)
instead. Hereby, the OIP architecture is able to reduce the overall
machine code size of VLIW programs significantly. We will
show analytically as well as experimentally that, compared to
a VLIW processor, the savings in instruction memory size easily
compensate the overhead of one separate branch unit needed for
each FU.
For the analytical analysis, a mathematical model of hardware
costs of an OIP processor is developed and compared to a
conventional VLIW processor. In addition, we compare the code
size of selected representative programs of the new processor
architecture and show big savings of program memory. Here, the
instruction memory requirements can be decreased by a factor of
0.465. This decrease in instruction memory, despite the discussed
overhead, leads to savings in the overall hardware costs of one
processor by a factor of 0.98}, author = {Brand, Marcel and Hannig, Frank and Tanase, Alexandru-Petru and Teich, Jürgen}, booktitle = {2017 IEEE 11th International Symposium on Embedded Multicore/Many-core Systems-on-Chip}, date = {2017-09-18/2017-09-20}, doi = {10.1109/MCSoC.2017.17}, faupublication = {yes}, isbn = {978-1-5386-3441-7}, pages = {5-12}, peerreviewed = {unknown}, title = {{Orthogonal} {Instruction} {Processing}: {An} {Alternative} to {Lightweight} {VLIW} {Processors}}, venue = {Korea University, Seoul, Korea}, year = {2017} } @inproceedings{faucris.118076024, abstract = {This paper deals with the mapping of loop programs onto processor arrays either implemented in an FPGA or available as (reconfigurable) coarse-grained processor architectures. Usually the proportion of processing elements to I/O-interfaces is much higher whereby problems of data transportation and synchronization are arising. In this realm, we propose a systematic approach in order to feed-out data. Here, (a) an efficient routing strategy is presented and (b) a novel retiming strategy is given in order to ensure collision free output serialization.}, author = {Hannig, Frank and Teich, Jürgen}, booktitle = {Proceedings of the International Conference on Engineering of Reconfigurable Systems and Algorithms}, date = {2005-06-27/2005-06-30}, faupublication = {yes}, isbn = {1-932415-74-2}, keywords = {Hardware mapping; Processor arrays; Serialization}, note = {UnivIS-Import:2015-04-16:Pub.2005.tech.IMMD.inform.output}, pages = {78-84}, title = {{Output} {Serialization} for {FPGA}-based and {Coarse}-grained {Processor} {Arrays}}, venue = {Las Vegas, NV}, year = {2005} } @inproceedings{faucris.121252384, author = {Ahmadinia, Ali and Bobda, Christophe and Majer, Mateusz and Teich, Jürgen}, booktitle = {Proceedings of the 12th Reconfigurable Architectures Workshop (RAW 2005)}, date = {2005-04-04/2005-04-05}, editor = {2005}, faupublication = {yes}, pages = {154b}, peerreviewed = {unknown}, title = {{Packet} {Routing} in {Dynamically} {Changing} {Networks} on {Chip}}, venue = {Denver}, year = {2005} } @inproceedings{faucris.118306364, abstract = {State-of-the-art behavioral synthesis tools barely have high-level transformations in order to achieve highly parallelized implementations. If any, they apply loop unrolling to obtain a higher throughput. In this paper, we employ the PARO behavioral synthesis tool which has the unique ability to perform both loop unrolling or loop partitioning. Loop unrolling replicates the loop kernel and exposes the parallelism for hardware implementation, whereas partitioning tiles the loop program onto a regular array consisting of tightly coupled processing elements. The usage of the same design tool for both the variants enables for the first time, a quantitative evaluation of the two approaches for reconfigurable architectures with help of computationally intensive algorithms selected from different benchmarks. Superlinear speedups in terms of throughput are accomplished for the processor array approach. In addition, area and power cost are reduced. © 2009 Springer Berlin Heidelberg.}, author = {Hannig, Frank and Dutta, Hritam and Teich, Jürgen}, booktitle = {Proceedings of the 22nd International Conference on Architecture of Computing Systems}, date = {2009-03-10/2009-03-13}, doi = {10.1007/978-3-642-00454-4{\_}5}, faupublication = {yes}, isbn = {978-3-642-00453-7}, note = {UnivIS-Import:2015-04-16:Pub.2009.tech.IMMD.inform.parall}, pages = {16-27}, publisher = {Springer-verlag}, series = {Lecture Notes in Computer Science (LNCS)}, title = {{Parallelization} {Approaches} for {Hardware} {Accelerators} - {Loop} {Unrolling} versus {Loop} {Partitioning}}, venue = {Delft}, volume = {5455}, year = {2009} } @incollection{faucris.228838890, author = {Roloff, Sascha and Hannig, Frank and Teich, Jürgen}, booktitle = {Modeling and Simulation of Invasive Applications and Architectures}, doi = {10.1007/978-981-13-8387-8{\_}5}, editor = {Sascha Roloff, Frank Hannig, Jürgen Teich}, faupublication = {yes}, month = {Jan}, note = {CRIS-Team WoS Importer:2019-11-08}, pages = {101-128}, peerreviewed = {unknown}, series = {Computer Architecture and Design Methodologies}, title = {{Parallel} {MPSoC} {Simulation} and {Architecture} {Evaluation}}, year = {2019} } @inproceedings{faucris.109695784, author = {Xu, Yang and Wang, Bo and Teich, Jürgen}, booktitle = {Proceedings of the International Workshop on Power and Timing Modeling, Optimization and Simulation (PATMOS)}, date = {2014-09-29/2014-10-01}, doi = {10.1109/PATMOS.2014.6951860}, faupublication = {yes}, pages = {1-6}, peerreviewed = {unknown}, title = {{Parametric} {Yield} {Optimization} {Using} {Leakage}-{Yield}-{Driven} {Floorplanning}}, venue = {Palma de Mallorca}, year = {2014} } @inproceedings{faucris.119080544, author = {Teich, Jürgen}, booktitle = {Proc. First International Conference on Evolutionary Multi-Criterion Optimization, In Lecture Notes in Computer Science (LNCS), Vol. 1993}, date = {2001-03-07/2001-03-09}, faupublication = {no}, note = {UnivIS-Import:2015-04-16:Pub.2001.tech.IMMD.inform.pareto}, pages = {314-328}, title = {{Pareto}-{Front} {Exploration} with {Uncertain} {Objectives}}, venue = {Zurich}, year = {2001} } @inproceedings{faucris.118201424, abstract = {In this paper, we present the PARO design tool for the automated hardware synthesis of massively parallel embedded architectures for given dataflow dominant applications. Key features of PARO are: (1) The design entry in form of a compact and intuitive functional programming language which allows highly parallel implementations. (2) Advanced partitioning techniques are applied in order to balance the trade-offs in cost and performance along with requisite throughputs. This is obtained by distributing computations onto an array of tightly coupled processor elements. (3) We demonstrate the performance of the FPGA synthesized hardware with several selected algorithms from different benchmarks. © 2008 Springer-Verlag Berlin Heidelber}, address = {Berlin Heidelberg}, author = {Hannig, Frank and Ruckdeschel, Holger and Dutta, Hritam and Teich, Jürgen}, booktitle = {Proceedings of the Fourth International Workshop on Applied Reconfigurable Computing}, date = {2008-03-26/2008-03-28}, doi = {10.1007/978-3-540-78610-8{\_}30}, faupublication = {yes}, note = {UnivIS-Import:2015-04-16:Pub.2008.tech.IMMD.inform.parosy}, pages = {287-293}, publisher = {Springer-verlag}, series = {Lecture Notes in Computer Science (LNCS)}, title = {{PARO}: {Synthesis} of {Hardware} {Accelerators} for {Multi}-{Dimensional} {Dataflow}-{Intensive} {Applications}}, venue = {London}, year = {2008} } @inproceedings{faucris.118254884, abstract = {Run-time reconfiguration of FPGAs has been around in academia for more than two decades but it is still applied very seldom in industrial applications. This has two main reasons: a lack of killer applications that substantially benefit from run-time reconfiguration and design tools that permit to quickly implement corresponding reconfigurable systems. This tutorial gives a survey on state-of-the-art trends on reconfigurable architectures and devices, application specific requirements, and design techniques and tools that are essential for implementing partial run-time reconfiguration on FPGAs. This is followed by a demonstration of the floorplanning and constraint generation tool GoAhead. Furthermore, the tutorial will reveal several applications that benefit from partial reconfiguration, including network data processing, digital signal processing, cognitive radio, and systems on a reconfigurable chip. For these applications, the individual challenges and implementation issues are presented together with the achieved results. This tutorial demonstrates that partial FPGA reconfiguration is beneficial and applicable in industrial systems. © 2012 Gesellschaft fuer Informatk.}, address = {New York, NY, USA}, author = {Koch, Dirk and Torresen, Jim and Beckhoff, Christian and Ziener, Daniel and Dennl, Christopher and Breuer, Volker and Teich, Jürgen and Feilen, Michael and Stechele, Walter}, booktitle = {Proc. of the 25th International Conference on Architecture of Computing Systems (ARCS)}, date = {2012-02-28/2012-03-02}, faupublication = {yes}, isbn = {978-3-642-28292-8}, note = {UnivIS-Import:2015-04-16:Pub.2012.tech.IMMD.inform.partia}, pages = {297-319}, publisher = {IEEE Press}, title = {{Partial} {Reconfiguration} on {FPGAs} in {Practice} - {Tools} and {Applications}}, venue = {Munich}, year = {2012} } @article{faucris.115279604, abstract = {The paper describes the systematic design of processor arrays with a given dimension and given number of processing elements. This problem is called partitioning. A solution to the partitioning problem is described for mapping a class of algorithms with piecewise regular dependence graphs, i.e. piecewise regular algorithms, onto processor arrays. These arrays are also piecewise regular, i.e. they are composed of a number of regularly connected homogenous subarrays. Partitioning deals with the division of the dependence graph of a piecewise regular algorithm into tiles and the scheduling of corresponding operations on a processor array of fixed size and dimension. Different solutions to this problem are termed partitioning schemes. Partitioning schemes may be classified into projection, multiprojection, passive and active clustering. The hereafter presented unified approach to the solution of the partitioning problem is based on the following concepts: (1) Algorithms are represented by programs. These programs can be directly interpreted as a description of hardware. (2) The concept of stepwise refinement of programs is used to solve the partitioning problem by applying a sequence of provably correct program transformations. The transformations basically involve operations on index sets. Two program transformations are introduced: (a) The EXPAND program transformation partitions the iteration space of a given program into a direct sum of lattices. The dimension of the iteration space increases. In contrary to other approaches, also nonperfect tilings may be considered. (b) Operations are scheduled on a processor array of fixed size and dimension using the REDUCE transformation. The dimension of the iteration space and thereby the dimension of the processor array is reduced. The parameters of this program transformation enable the realization of the different partitioning schemes. (3) The whole solution is embedded in the concepts of a systematic design of processor arrays. © 1993.}, author = {Teich, Jürgen and Thiele, Lothar}, doi = {10.1016/0167-9260(93)90013-3}, faupublication = {no}, journal = {Integration-The Vlsi Journal}, keywords = {algorithm transformation; clustering; fixed size arrays; mapping of algorithms; partitioning; VLSI processor arrays}, note = {UnivIS-Import:2015-03-05:Pub.1993.tech.IMMD.inform.partit}, pages = {14(3):297-332}, peerreviewed = {Yes}, title = {{Partitioning} of processor arrays: {A} piecewise regular approach}, year = {1993} } @article{faucris.115473204, abstract = {A single integer linear programming model for optimally scheduling partitioned regular algorithms is presented. The herein presented methodology differs from existing methods in the following capabilities: 1) Not only constraints on the number of available processors and communication capabilities are taken into account, but also local memories and constraints on the size of available memories. 2) Different types of processors can be handled. 3) The size of the optimization model (number of integer variables) is independent of the size of the tiles to be executed. Hence, 4) the number of integer variables in the optimization model is greatly reduced such that problems of relevant size can be solved in practical execution time.}, author = {Teich, Jürgen and Thiele, Lothar and Zhang, L}, faupublication = {no}, journal = {Journal of VLSI Signal Processing Systems for Signal, Image, and Video Technology}, note = {UnivIS-Import:2015-03-05:Pub.1997.tech.IMMD.inform.partit}, pages = {5-20}, peerreviewed = {unknown}, title = {{Partitioning} {Processor} {Arrays} under {Resource} {Constraints}}, volume = {17}, year = {1997} } @book{faucris.204234213, address = {Basel, Schweiz}, doi = {10.3390/jlpea8040035}, edition = {8}, editor = {Fickenscher, Jörg and Schmidt, Sandra and Hannig, Frank and Bouzouraa, Mohammed Essayed and Teich, Jürgen}, faupublication = {yes}, peerreviewed = {Yes}, publisher = {Multidisciplinary Digital Publishing Institute}, series = {Special Issue: Automotive Low Power Technologies}, title = {{Path} {Planning} for {Highly} {Automated} {Driving} on {Embedded} {GPUs}}, volume = {4}, year = {2018} } @article{faucris.118066564, author = {Böke, Carsten and Ditze, C. and Hardt, Wolfram and Kleinjohann, Bernd and Rammig, Franz and Rettberg, Achim and Stroop, Joachim and Teich, Jürgen}, faupublication = {no}, journal = {EUROMICRO Journal}, note = {UnivIS-Import:2015-03-09:Pub.2000.tech.IMMD.inform.pbased}, peerreviewed = {unknown}, title = {{P}-based {System} {Design} with the {PARADISE} {Design} {Environment}}, year = {2000} } @misc{faucris.108917864, author = {Martin, Michael and Sriram, Sundararajan and Teich, Jürgen and Thiele, Lothar}, faupublication = {no}, peerreviewed = {automatic}, title = {{Performance} analysis and optimization of mixed asynchronous synchronous systems}, year = {1994} } @inproceedings{faucris.118908724, abstract = {The paper is concerned with the timing analysis of a class digital systems we call mixed asynchronous-synchronous systems. In such a system, each computation module is either synchronous (i.e. clocked) or asynchronous (i.e. selftimed). The communication between modules is assumed to be selftimed for all modules. We introduce a graph model called MASS for describing the timing behaviour of such architectures. The graph contains two kinds of nodes, synchronous and asynchronous nodes. The operation model of a MASS is similar to that of a timed marked graph, however, additional schedule constraints are imposed on synchronous nodes: A synchronous node can only fire at ticks of its local module clock. We analyze the behaviour of MASS, in particular period, periodicity and maximal throughput rate.}, author = {Martin, Michael and Sriram, Sundararajan and Teich, Jürgen and Thiele, Lothar}, booktitle = {Proc. of the IEEE Int. Workshop on VLSI Signal Processing 94}, date = {1994-10-26/1994-10-28}, faupublication = {no}, note = {UnivIS-Import:2015-04-16:Pub.1994.tech.IMMD.inform.perfor{\_}5}, pages = {103-112}, series = {IEEE VLSI Signal Processing VII}, title = {{Performance} {Analysis} of {Mixed} {Asynchronous}-{Synchronous} {Systems}}, year = {1994} } @article{faucris.120788404, author = {Martin, Michael and Sriram, Sundararajan and Teich, Jürgen and Thiele, Lothar}, faupublication = {no}, journal = {IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems}, note = {UnivIS-Import:2015-03-05:Pub.1997.tech.IMMD.inform.perfor}, pages = {473-484}, peerreviewed = {Yes}, title = {{Performance} {Analysis} of {Mixed} {Asynchronous}-{Synchronous} {Systems}}, volume = {16}, year = {1997} } @inproceedings{faucris.219351003, author = {Nisar, Abdullah and Ah Sue, Jonathan and Teich, Jürgen}, booktitle = {Proceedings of the 21st International Conference on Artificial Intelligence}, date = {2019-07-29/2019-08-01}, faupublication = {yes}, peerreviewed = {unknown}, title = {{Performance} {Comparison} between {Machine} {Learning} based {LTE} {Downlink} {Grant} {Predictors}}, venue = {Las Vegas}, year = {2019} } @inproceedings{faucris.120019724, abstract = {In order to meet demanding challenges of increasing computational requirements and stringent power constraints, there is a gradual trend towards heterogeneous multi-processor system-on-chip (MPSoC) designs integrating application specific acceleration engines. One major problem faced by the design tools for mapping of algorithms onto MPSoC architectures is the dimensioning of system components through performance analysis. In this paper, we propose a fast and accurate methodology for rate matching of statically scheduled acceleration engines using modular performance analysis. Given a set of Pareto-optimal hardware accelerator designs and an input workload behavior, the proposed methodology determines cost efficient hardware accelerators that can handle the workload. A motion JPEG case study illustrates the benefit of coupling high level synthesis tools with performance analysis. © 2009 Springer Berlin Heidelberg.}, author = {Dutta, Hritam and Hannig, Frank and Teich, Jürgen}, booktitle = {Proceedings of the 22nd International Conference on Architecture of Computing Systems}, date = {2009-03-10/2009-03-13}, doi = {10.1007/978-3-642-00454-4{\_}23}, faupublication = {yes}, isbn = {978-3-642-00453-7}, note = {UnivIS-Import:2015-04-16:Pub.2009.tech.IMMD.inform.perfor}, pages = {233-245}, publisher = {Springer-verlag}, series = {Lecture Notes in Computer Science (LNCS)}, title = {{Performance} {Matching} of {Hardware} {Acceleration} {Engines} for {Heterogeneous} {MPSoC} using {Modular} {Performance} {Analysis}}, venue = {Delft}, volume = {5455}, year = {2009} } @article{faucris.119514164, author = {Wildermann, Stefan and Angermeier, Josef and Sibirko, Eugen and Teich, Jürgen}, doi = {10.1155/2012/608312}, faupublication = {yes}, journal = {International Journal of Reconfigurable Computing}, note = {UnivIS-Import:2015-03-09:Pub.2012.tech.IMMD.inform.placin}, pages = {1-12}, peerreviewed = {unknown}, title = {{Placing} {Multi}-mode {Streaming} {Applications} on {Dynamically} {Partially} {Reconfigurable} {Architectures}}, volume = {2012}, year = {2012} } @inproceedings{faucris.119309344, abstract = {By means of partial reconfiguration, parts of the hardware can be dynamically exchanged during operation what allows to adapt the system to changing requirements, and even enables the implementation of self-managing systems. This however requires sophisticated system architectures as well as proper algorithmic runtime support. In this paper, we present an algorithm for placing streaming applications at runtime. The approach considers the heterogeneity of common FPGAs, such as Block-RAMs, as well as the routing restrictions of on-chip streaming interconnections. To reduce reconfiguration time, we extend the data flow graphs by OR-nodes to describe differing parts of applications while keeping their similarities. This allows us to model systems which only reconfigure differing parts when switching between applications. The proposed algorithm is implemented as runtime support on an FPGA-based system-on-chip. © 2010 IEEE.}, author = {Angermeier, Josef and Wildermann, Stefan and Sibirko, Eugen and Teich, Jürgen}, booktitle = {Proc. International Conference on ReConFigurable Computing and FPGAs}, date = {2010-12-13/2010-12-15}, doi = {10.1109/ReConFig.2010.52}, faupublication = {yes}, note = {UnivIS-Import:2015-04-16:Pub.2010.tech.IMMD.inform.placin}, pages = {91-96}, title = {{Placing} {Streaming} {Applications} with {Similarities} on {Dynamically} {Partially} {Reconfigurable} {Architectures}}, venue = {Cancun}, year = {2010} } @inproceedings{faucris.116472884, abstract = {In this paper we present a novel methodology for partial (re-)configuration that can be used for most bitstream configured hardware (HW). In particular low priced and not for partial reconfiguration designed devices can be used by our technique. Furthermore, the methodology is platform independent and requires neither specialized HW synthesis tools nor a documented bitstream. We manage configuration data by extracting and compressing (sub-) module data from complete bitstreams. Experiments demonstrated that this extracted bitstreams could be compressed down by orders of the primary bitstream size for some submpdules.}, author = {Koch, Dirk and Teich, Jürgen}, booktitle = {Proceedings of the 2004 ACM conference Computing Frountiers}, date = {2004-04-14/2004-04-16}, faupublication = {yes}, isbn = {1-58113-741-9}, keywords = {Bitstream Extraction; Configuration Compression; Partial Reconfiguration}, note = {UnivIS-Import:2015-04-16:Pub.2004.tech.IMMD.inform.platfo}, pages = {398 - 403}, title = {{Platform}-{Independent} {Methodology} for {Partial} {Reconfiguration}}, venue = {Ischia}, year = {2004} } @inproceedings{faucris.224386116, author = {Witterauf, Michael and Hannig, Frank and Teich, Jürgen}, booktitle = {Proceedings of the International Conference on Formal Methods and Models for System Design (MEMOCODE)}, date = {2019-10-09/2019-10-11}, faupublication = {yes}, peerreviewed = {Yes}, title = {{Polyhedral} {Fragments}: {An} {Efficient} {Representation} for {Symbolically} {Generating} {Code} for {Processor} {Arrays}}, venue = {San Diego}, year = {2019} } @inproceedings{faucris.121721864, author = {Weichslgartner, Andreas and Heißwolf, Jan and Zaib, Aurang and Wild, Thomas and Herkersdorf, Andreas and Becker, Jürgen and Teich, Jürgen}, booktitle = {Proceedings of the second International Workshop on Multi-Objective Many-Core Design (MOMAC) in conjunction with International Conference on Architecture of Computing Systems (ARCS)}, date = {2015-03-24/2015-03-24}, faupublication = {yes}, pages = {4-5}, peerreviewed = {unknown}, title = {{Position} {Paper}: {Towards} {Hardware}-{Assisted} {Decentralized} {Mapping} of {Applications} for {Heterogeneous} {NoC} {Architectures}.}, venue = {Porto}, year = {2015} } @inproceedings{faucris.109609984, author = {Weichslgartner, Andreas and Teich, Jürgen}, booktitle = {Proceedings of the third International Workshop on Multi-Objective Many-Core Design (MOMAC) in conjunction with International Conference on Architecture of Computing Systems (ARCS)}, date = {2016-04-04/2016-04-05}, faupublication = {yes}, pages = {4-5}, peerreviewed = {unknown}, title = {{Position} {Paper}: {Towards} {Redundant} {Communication} through {Hybrid} {Application} {Mapping}}, venue = {Nuremberg}, year = {2016} } @article{faucris.106475424, author = {Khdr, Heba and Pagani, Santiago and Rodrigues Sousa, Ericles and Lari, Vahid and Pathania, Anuj and Hannig, Frank and Shafique, Muhammad and Teich, Jürgen and Henkel, Jörg}, doi = {10.1109/TC.2016.2595560}, faupublication = {yes}, journal = {IEEE Transactions on Computers}, pages = {488--501}, peerreviewed = {Yes}, title = {{Power} {Density}-{Aware} {Resource} {Management} for {Heterogeneous} {Tiled} {Multicores}}, volume = {66}, year = {2017} } @inproceedings{faucris.117400184, author = {Kissler, Dmitrij and Strawetz, Andreas and Hannig, Frank and Teich, Jürgen}, booktitle = {Proceedings of the 18th International Workshop on Power and Timing Modeling, Optimization and Simulation (PATMOS)}, date = {2008-09-10/2008-09-12}, faupublication = {yes}, pages = {307–317}, peerreviewed = {unknown}, title = {{Power}-efficient {Reconfiguration} {Control} in {Coarse}-{Grained} {Dynamically} {Reconfigurable} {Architectures}}, venue = {Lisbon}, year = {2008} } @article{faucris.121047344, abstract = {Coarse-grained reconfigurable architectures deliver high performance and energy efficiency for computationally intensive applications like mobile multimedia and wireless communication. This work deals with the aspect of power-efficient dynamic reconfiguration control techniques in such architectures. Proper clock domain partitioning with custom clock gating combined with automatic clock gating resulted in a 38% total power reduction. This is more than a threefold as compared to the single clock gating techniques applied separately. One of the corresponding case study applications with 0.064 milliwatts per megahertz and 124 million operations per second per milliwatt power efficiency outperforms the major coarse-grained and general purpose embedded processor architectures by a factor of 1.7 to 28. Copyright © 2009 American Scientific Publishers All rights reserved Printed in the United States of America.}, author = {Kissler, Dmitrij and Strawetz, Andreas and Hannig, Frank and Teich, Jürgen}, doi = {10.1166/jolpe.2009.1008}, faupublication = {yes}, journal = {Journal of Low Power Electronics}, keywords = {Coarse-Grained Reconfigurable Architectures (CGRA); Hierarchical Clock Gating; Low Power Adaptive Architectures}, note = {UnivIS-Import:2015-04-14:Pub.2009.tech.IMMD.inform.powere}, pages = {96-105}, peerreviewed = {unknown}, title = {{Power}-efficient {Reconfiguration} {Control} in {Coarse}-grained {Dynamically} {Reconfigurable} {Architectures}}, volume = {5}, year = {2009} } @inproceedings{faucris.118575864, abstract = {We propose a novel data budget-based approach to dynamically control the average power consumption of Serial RapidIO endpoint controllers in FPGAs. The key concept of the approach is to not only perform clock-gating on the FPGA-internal components of the communication controller, but to disable the multi-gigabit transceivers during idle periods. The clock synchronization, inherent to serial interfaces, enables us to omit the often needed periodic link sensing, and only enable the controller according to a predefined schedule to transmit the allocated amount of data during a specific interval. Following this approach, we are able to reduce the dynamic power consumption by up to 77 % on average. © 2012 IEEE.}, address = {New York, NY, USA}, author = {Schmid, Moritz and Hannig, Frank and Teich, Jürgen}, booktitle = {Proc. of the IEEE International Field-Programmable Custom Computing Machines Symposium (FCCM)}, date = {2012-04-29/2012-05-01}, doi = {10.1109/FCCM.2012.26}, faupublication = {yes}, isbn = {978-0-7695-4699-5}, note = {UnivIS-Import:2015-04-16:Pub.2012.tech.IMMD.inform.powerm}, pages = {101-108}, publisher = {IEEE Press}, title = {{Power} {Management} {Strategies} for {Serial} {RapidIO} {Endpoints} in {FPGAs}}, venue = {Toronto}, year = {2012} } @article{faucris.111926804, abstract = {In this paper, we introduce a new method for watermarking of IP cores for FPGA architectures where the signature (watermark) is detected at the power supply pins of the FPGA. This is the first watermarking method where the signature is extracted in this way. We are able to sign IP cores at the netlist as well as the bitfile level, so a wide spectrum of cores can be protected. In principle, the proposed power watermarking method works for all kinds of FPGAs. For Xilinx FPGAs, we demonstrate in detail that we can integrate the watermarking algorithms and the signature into the functionality of the watermarked core. So it is very hard to remove the watermark without destroying the core. Furthermore, we introduce a detection algorithm which can decode the signature from a voltage trace with high reliability. Additionally, two enhanced robustness algorithms are introduced which improve the detection probability in case of considerable noise sources. Using these techniques, it is possible to decode the signature even if other cores operate on the same device at the same time. © 2007 Springer Science+Business Media, LLC.}, author = {Ziener, Daniel and Teich, Jürgen}, doi = {10.1007/s11265-007-0136-8}, faupublication = {yes}, journal = {Journal of Signal Processing Systems For Signal Image and Video Technology}, keywords = {FPGA; IP cores; IPP; Power analysis; Signature; Watermarking}, note = {UnivIS-Import:2015-03-09:Pub.2008.tech.IMMD.inform.powers}, pages = {123-136}, peerreviewed = {Yes}, title = {{Power} {Signature} {Watermarking} of {IP} {Cores} for {FPGAs}}, volume = {51}, year = {2008} } @article{faucris.276788186, abstract = {High performance and, at the same time, energy efficiency are important yet often conflicting requirements in many fields of emerging applications. Those applications range from multi-dimensional and multi-sensor digital signal processing to machine learning, such as neural network processing. Whereas conventional fixed-point and floating-point processor architectures cannot adapt to quite diverging demands related to required precision and accuracy of computations, even within a single application, e.g., in different layers of a neural network, domain-specific accelerators may be much too specific and thus rigid to cover a wide enough spectrum of applications. In this tutorial brief, we give an overview of existing processor solutions that are reconfigurable or tunable in precision or accuracy of computations. The spectrum of reviewed architectures ranges from processors with vectorizable processors over multi- and trans-precision solutions, including GPUs to any-time instruction-set processors. The latter works with a fixed precision, but the accuracy of the result of floating-point operations is encoded in the instruction word. It can thus vary from instruction to instruction. This allows realizing accuracy vs. execution time or energy tradeoffs. Subsequently, we investigate several application domains, including neural network processing, linear algebra, and approximate computing, where such emerging processor architectures can be beneficially use}, author = {Brand, Marcel and Hannig, Frank and Keszöcze, Oliver and Teich, Jürgen}, doi = {10.1109/TCSII.2022.3173753}, faupublication = {yes}, journal = {IEEE Transactions on Circuits and Systems II: Express Briefs}, keywords = {Computer architecture; Arithmetic; Circuits and systems; Neural networks; Adders; Machine learning algorithms; Linear algebra; Accuracy; Reconfigurable architectures; Convolutional neural networks}, pages = {2661 - 2666}, peerreviewed = {Yes}, title = {{Precision}- and {Accuracy}-{Reconfigurable} {Processor} {Architectures}—{An} {Overview}}, volume = {69}, year = {2022} } @article{faucris.209242123, abstract = {The fulfillment of non-functional requirements like timing or energy consumption is of utmost
importance in many embedded systems and respective applications. Especially, with the introduc-
tion of multi-core architectures, the ability to predict non-functional execution qualities becomes
more and more difficult, as multiple concurrent application programs may interfere in execution
when typcially sharing all the resources. In this paper, we advocate a novel parallel computing
paradigm called invasive computing that allows to isolate application programs on multi-core
targets. For a presented case study of a cyber-physical real-time control system, we show that
invasive computing enables composability that in fact allows to characterize and analyze each ap-
plication program statically and independent from each other. More specifically, it is shown that a
distributed object detection algorithm for controlling an inverted pendulum and implemented on
a heterogeneous invasive multi-processor SoC (MPSoC) is able to provide real-time guarantees
as well as reliability requirements on demand.
In this paper, we introduce a generic interval-matching technique which provides several degrees of freedom for fine-tuning it to the statistical deviations of waveform measurements of COs. Moreover, we introduce a novel calibration method that finds the best parameters automatically based on statistical analysis of training data. Furthermore, we investigate a technique to reduce the number of features used for the interval matching by utilizing machine-learning-based feature extraction to find the most important samples in a template.
Finally, we evaluate the state-of-the-art interval matching and our expansions during calibration and during the application on a test set. The results show, that a reliable reduction to 10% of the original template size is possible with a reduction method from literature for our example. However, the combination of our proposed methods can reliably work with only 1.5% of the original size and is less volatile than the state-of-the-art approach for reducing the number of feature}, author = {Trautmann, Jens and Patsiatzis, Nikolaos and Becher, Andreas and Wildermann, Stefan and Teich, Jürgen}, booktitle = {Proceedings of the 2022 Workshop on Attacks and Solutions in Hardware Security}, date = {2022-11-11/2022-11-11}, doi = {10.1145/3560834.3563828}, editor = {Association for Computing Machinery}, faupublication = {yes}, keywords = {Signal Processing; Interval Matching; Waveform Matching; Side-Channel Analysis}, peerreviewed = {Yes}, publisher = {ACM}, title = {{Putting} {IMT} to the {Test}: {Revisiting} and {Expanding} {Interval} {Matching} {Techniques} and their {Calibration} for {SCA}}, url = {https://dl.acm.org/doi/10.1145/3560834.3563828}, venue = {Los Angeles, CA, USA}, year = {2022} } @incollection{faucris.120771904, address = {London}, author = {Mostaghim, Sanaz and Teich, Jürgen}, booktitle = {Evolutionary Multiobjective Optimization}, editor = {Ajith Abraham and Lakhmi Jain and Robert Goldberg}, faupublication = {yes}, note = {UnivIS-Import:2015-04-20:Pub.2003.tech.IMMD.inform.quadtr}, pages = {81-104}, peerreviewed = {Yes}, publisher = {Springer}, series = {Advanced Information and Knowledge Processing}, title = {{Quad}-trees: {A} {Data} structure for storing {Pareto}-sets in {Multi}-objective {Evolutionary} {Algorithms} with {Elitism}}, year = {2005} } @inproceedings{faucris.119892784, abstract = {Current mobile devices extensively run video players that are power hungry. Further, higher power densities as a result of technology scaling results in higher on-chip temperatures. Unlike general purpose computer systems, mobile devices that run on batteries cannot afford to have expensive cooling mechanisms. Therefore, in order to satisfy thermal constraints while running power hungry applications, dynamic thermal management (DTM) techniques have been employed. For multimedia applications, the techniques primarily relied on dynamic voltage and frequency scaling (DVFS) and dynamic power management (DPM) while taking care that maximum video quality is achieved. However, no prior work has exploited frame drops to lower the inserted idle times under predetermined quality constraints. In this work, we propose a DPM framework that utilizes frame drops to dynamically insert low idle times in order to satisfy a peak temperature constraint under a given quality constraint. This also reduces the end-to-end latency. The latencies are further reduced by maintaining lightweight workload histories. For the videos used in our experiments, it was observed that a small reduction in quality of 2 dB (reduction from 32 dB to 30 dB) due to frame drops in motion videos results in a maximum latency reduction of 1.7 sec. © 2014 IEEE.}, author = {Gangadharan, Deepak and Teich, Jürgen and Chakraborty, Samarjit}, booktitle = {Proceedings of the 25th IEEE International Conference on Application-specific Systems, Architectures and Processors (ASAP)}, date = {2014-06-18/2014-06-29}, doi = {10.1109/ASAP.2014.6868670}, faupublication = {yes}, isbn = {9781479936090}, pages = {256-263}, publisher = {Institute of Electrical and Electronics Engineers Inc.}, title = {{Quality}-aware video decoding on thermally-constrained {MPSoC} platforms}, venue = {Zurich}, year = {2014} } @inproceedings{faucris.118202744, author = {Dutta, Hritam and Hannig, Frank and Ruckdeschel, Holger and Teich, Jürgen}, booktitle = {Proceedings of the 2nd HiPEAC Workshop on Reconfigurable Computing}, faupublication = {yes}, note = {UnivIS-Import:2015-04-16:Pub.2008.tech.IMMD.inform.quanti{\_}0}, pages = {73-82}, title = {{Quantitative} {Evaluation} of {Behavioral} {Synthesis} {Approaches} for {Reconfigurable} {Devices}}, venue = {Gothenburg}, year = {2008} } @inproceedings{faucris.117096584, abstract = {Signal processing algorithms as can be found in multimedia applications are often modeled by dynamic Data Flow Graphs (DFGs), especially when targeting heterogeneous multicore platforms. However, there is often a mismatch between the fine granularity of the application and the coarse granularity of the platform. Tailoring the granularity of the DFG to a given platform by employing Quasi-Static Schedules (QSSs) promises performance gains by reducing dynamic scheduling overhead and enabling optimizations targeting groups of actors instead of individual actors in isolation. Unfortunately, all approaches known from literature to compute QSSs implicitly assume DFGs with unbounded First In First Out (FIFO) channels. In contrast, mappings of DFGs to multi-core platforms must adhere to FIFO channels with limited capacities. In this paper, we present a novel FIFO channel capacity adjustment algorithm that enables QSSs to DFGs with limited channel capacities, thus, extending the scope of QSS refinements to general multi-core targets.}, author = {Falk, Joachim and Schwarzer, Tobias and Glaß, Michael and Teich, Jürgen and Zebelein, Christian and Haubelt, Christian}, booktitle = {13th IEEE Symposium on Embedded Systems for Real-Time Multimedia, ESTIMedia 2015}, doi = {10.1109/ESTIMedia.2015.7351766}, faupublication = {yes}, isbn = {9781467381642}, keywords = {Channel capacity; Dynamic scheduling; Flow graphs; Hardware; Multimedia communication; Ports (Computers); System recovery}, peerreviewed = {unknown}, publisher = {Institute of Electrical and Electronics Engineers Inc.}, title = {{Quasi}-static scheduling of data flow graphs in the presence of limited channel capacities}, year = {2015} } @inproceedings{faucris.121580844, abstract = {In this paper, the problem of automatically mapping large-grain dataflow programs onto heterogeneous hardware/software architectures is treated. Starting with a given hardware/software partition, interfaces are inserted into the specification to account for communication, in particular across hardware/software boundaries. Depending on the target architecture, the interfaces are refined according to given communication constraints (bus protocols, memory mapping, interrupts, DMA, etc.). A framework is described that uses an object-oriented approach to transform a given dataflow graph and to generate code for the actors as well as for the interfaces. The object-orientation enables an easy migration (retargeting) of typical communication primitives to other target architectures.}, author = {Eisenring, Michael and Teich, Jürgen and Thiele, Lothar}, booktitle = {Proc. of HICSS'98, the Hawai'i Int. Conf. on Syst. Sci.}, date = {1998-01-06/1998-01-09}, faupublication = {no}, note = {UnivIS-Import:2015-04-16:Pub.1998.tech.IMMD.inform.rapidp}, pages = {187-196}, publisher = {Institute of Electrical and Electronics Engineers Computer Society}, title = {{Rapid} {Prototyping} of {Dataflow} {Programs} on {Hardware}/{Software} {Architectures}}, venue = {Kona, Hawaii}, year = {1998} } @incollection{faucris.242678571, author = {Herkersdorf, Andreas and Engel, Michael and Glaß, Michael and Henkel, Jörg and Kleeberger, Veit B. and Kühn, Johannes M. and Marwedel, Peter and Mueller-Gritschneder, Daniel and Nassif, Sani R. and Rehman, Semeen and Rosenstiel, Wolfgang and Schlichtmann, Ulf and Shafique, Muhammad and Teich, Jürgen and Wehn, Norbert and Weis, Christian}, booktitle = {Dependable Embedded Systems}, doi = {10.1007/978-3-030-52017-5{\_}1}, editor = {Henkel, Jörg, Dutt, Nikil}, faupublication = {yes}, isbn = {978-3-030-52017-5}, peerreviewed = {unknown}, title = {{RAP} {Model} - {Enabling} {Cross}-{Layer} {Analysis} and {Optimization} for {System}-on-{Chip} {Resilience}}, year = {2020} } @inproceedings{faucris.286915454, abstract = {In embedded systems, applications frequently have to meet non-functional requirements regarding, e.g., real-time or energy consumption constraints, when executing on a given MPSoC target platform.
Feedback-based controllers have been proposed that react to transient environmental factors by adapting the DVFS settings or degree of parallelism following some predefined control strategy. However, it is, in general, not possible to give formal guarantees for the obtained controllers to satisfy a given set of non-functional requirements. Run-time requirement enforcement has emerged as a field of research for the enforcement of non-functional requirements at run-time, allowing to define and formally verify properties on respective control strategies specified by automata. However, techniques for the automatic generation of such controllers have not yet been established.
In this paper, we propose a technique using reinforcement learning to automatically generate verifiable feedback-based enforcers. For that, we train a control policy based on a representative input sequence at design time. The learned control strategy is then transformed into a verifiable enforcement automaton which constitutes our run-time control model that can handle unseen input data. As a case study, we apply the approach to generate controllers that are able to increase the probability of satisfying a given set of requirement verification goals compared to multiple state-of-the-art approaches, as can be verified by model checkers.Many Big Data applications include the processing of data streams on semi-structured data formats such as JSON. A disadvantage of such formats is that an application may spend a significant amount of processing time just on unselectively parsing all data. To relax this issue, the concept of raw filtering is proposed with the idea to remove data from a stream prior to the costly parsing stage. However, as accurate filtering of raw data is often only possible after the data has been parsed, raw filters are designed to be approximate in the sense of allowing false-positives in order to be implemented efficiently.

Contrary to previously proposed CPU-based raw filtering techniques that are restricted to string matching, we present FPGA-based primitives for filtering strings, numbers and also number ranges. In addition, a primitive respecting the basic structure of JSON data is proposed that can be used to further increase the accuracy of introduced raw filters.

The proposed raw filter primitives are designed to allow for their composition according to a given filter expression of a query. Thus, complex raw filters can be created for FPGAs which enable a drastical decrease in the amount of generated false-positives, particularly for IoT workload.

As there exists a trade-off between accuracy and resource consumption, we evaluate primitives as well as composed raw filters using different queries from the RiotBench benchmark. Our results show that up to 94.3% of the raw data can be filtered without producing any observed false-positives using only a few hundred LUTs.

}, author = {Hahn, Tobias and Becher, Andreas and Wildermann, Stefan and Teich, Jürgen}, booktitle = {Proceedings of the 2022 Conference & Exhibition on Design, Automation & Test in Europe}, date = {2022-03-14/2022-03-23}, doi = {10.23919/DATE54114.2022.9774696}, faupublication = {yes}, peerreviewed = {unknown}, title = {{Raw} {Filtering} of {JSON} data on {FPGAs}}, venue = {Antwerpen}, year = {2022} } @inproceedings{faucris.119854504, abstract = {Unprocessed range images acquired by some range sensing modality often show bumpy surfaces and distorted object boundaries, complicating post processing, such as 3D-registration and feature extraction. The effects are mostly caused by noise due to sensor limitations, but can be mitigated through applying image processing techniques, as for example defect pixel interpolation, bilateral temporal averaging, and edge-preserving noise filtering. In this work, we present an approach to perform preprocessing of range images on field programmable gate arrays (FPGAs) using single precision floating point arithmetic. Moreover, we present an FPGA infrastructure, including image acquisition from a host computer via PCI express. Although this work is based on range images obtained from a Microsoft Kinect range sensing camera, the presented approach is applicable to modalities for range image acquisition, in general. The proposed image processing pipeline can be run at 150MHz for VGA-resolution images and imposes a latency close to 2 ms. © 2013 IEEE.}, author = {Schmid, Moritz and Blocherer, Markus and Hannig, Frank and Teich, Jürgen}, booktitle = {Proc. International Conference on Reconfigurable Computing and FPGAs}, date = {2013-12-09/2013-12-11}, doi = {10.1109/ReConFig.2013.6732325}, faupublication = {yes}, note = {UnivIS-Import:2015-04-16:Pub.2013.tech.IMMD.inform.realti}, pages = {1-8}, publisher = {IEEE Computer Society}, title = {{Real}-{Time} {Range} {Image} {Preprocessing} on {FPGAs}}, venue = {Cancun}, year = {2013} } @inproceedings{faucris.243176667, abstract = {A fundamental problem of massively parallel ac-
celerator architectures is the management of typically small
peripheral I/O buffers that decouple the accelerator from an
external memory. Very often, these buffers cannot store the entire
input and output data of one execution and must be updated,
i.e., filled or drained, frequently. Moreover, if a processor array
performs either a read on an empty bank or a write on a
full bank, it must interrupt its execution immediately until the
corresponding data transfer between the accelerator and an
external memory has been carried out. As a consequence, the
timing predictability of the array execution might be impaired.
Therefore, a precise analysis of a schedule for all data transfers
is inevitable. But the sequence of all data transfers cannot
be stored entirely inside most accelerators. Thus, we must
determine and schedule all necessary data transfers dynamically
at runtime. In this paper, we present an approach to characterize
all necessary data transfers and to issue them in advance so
that the peripheral I/O buffers never run full or empty. Here,
it is shown first that a deadline for each data transfer can be
derived from a given loop schedule resulting in a traditional task
scheduling program. Unfortunately, however, standard real-time
scheduling techniques such as earliest deadline first (EDF) cannot
be applied here, as each data transfer must not be interrupted
and even existing non-preemptive variants of EDF are known
to be prone to timing anomalies. As a solution, we present a
strictly non-work-conserving variant of EDF together with an
efficient schedulability test for periodic loop executions. In an
experimental section, the scheduling approach is applied to a
randomly generated set of loop programs observing that our
algorithm is able to feasibly schedule 95% of the theoretically
schedulable problem instances. Altogether, we provide a fully
timing-predictable buffer management for massively parallel
processor arrays that avoids any I/O related stalls of a processor
array by construction.In this paper, we present a parallel waveform-matching architecture that is capable of performing waveform matching at the speed of fast ADCs. We implement the proposed architecture in a high-end FPGA-based digitizer and apply it to detect AES COs from the side channel of a single-board computer operating at 1 GHz. Our implementation allows for waveform matching at 10 GS/s with high accuracy, thus offering a speedup of 50x compared to the fastest state-of-the-art implementation known to u}, author = {Trautmann, Jens and Patsiatzis, Nikolaos and Becher, Andreas and Teich, Jürgen and Wildermann, Stefan}, booktitle = {IEEE Proceedings of the 32nd International Conference on Field Programmable Logic and Applications}, date = {2022-08-29/2022-09-02}, doi = {10.1109/FPL57034.2022.00025}, faupublication = {yes}, keywords = {Side-Channel Analysis;Waveform Matching;Parallel FPGA Design;Signal Processing}, peerreviewed = {Yes}, title = {{Real}-{Time} {Waveform} {Matching} with a {Digitizer} at 10 {GS}/s}, url = {https://ieeexplore.ieee.org/document/10035208}, venue = {Belfast, United Kingdom}, year = {2022} } @article{faucris.119783004, author = {Fanucci, Luca and Teich, Jürgen}, doi = {10.1109/MDAT.2016.2570223}, faupublication = {yes}, journal = {IEEE Design and Test of Computers}, pages = {114-117}, peerreviewed = {Yes}, title = {{Recap} of the 2016 {DATE} {Conference} & {Exhibition}}, volume = {33}, year = {2016} } @inproceedings{faucris.118153024, abstract = {In this paper, we present the ReCoBus-Builder tool chain that simplifies the generation of dynamically reconfigurable systems to almost a push-button process. The generated systems provide one or more resource areas that will be used by different partially reconfigurable modules at runtime. It is possible to integrate multiple partially reconfigurable modules into the same resource area at the same time and these modules can communicate via a fixed bus infrastructure or dedicated point-to-point links with other parts of the system. This allows building encapsulated modules that will be integrated into the system by linking together bitstreams at runtime. We will demonstrate that bitstream linking can further be used to speed up the design process of static only systems by eliminating long synthesis runs or place and route steps, when only small portions of a design are exchanged.© 2008 IEEE.}, address = {New York}, author = {Beckhoff, Christian and Koch, Dirk and Teich, Jürgen}, booktitle = {Proceedings of International Conference on Field-Programmable Logic and Applications}, date = {2008-09-08/2008-09-10}, doi = {10.1109/FPL.2008.4629918}, faupublication = {yes}, isbn = {978-1-4244-1960-9}, note = {UnivIS-Import:2015-04-16:Pub.2008.tech.IMMD.inform.recobu}, pages = {119-124}, publisher = {IEEE Press}, title = {{ReCoBus}-{Builder} - {A} {Novel} {Tool} and {Technique} to {Build} {Statically} and {Dynamically} {Reconfigurable} {Systems} for {FPGAs}}, venue = {Heidelberg}, year = {2008} } @inproceedings{faucris.106301184, abstract = {Recent research was mainly focused on the OS support for a single reconfigurable chip. This paper presents a general approach to manage fault tolerant distributed reconfigurable hardware. In order to run such a system, three basic tasks must be implemented: (i) rerouting to compensate line errors, (ii) rebinding to compensate node failures, and (iii) hardware reconfiguration to allow the optimization of these systems during runtime. This paper proposes first ideas and solutions of these management functions. Furthermore, a prototype implementation consisting of four fully connected FPGAs is presented.}, author = {Haubelt, Christian and Koch, Dirk and Teich, Jürgen}, booktitle = {Proceedings of the 16th Symposium on Integrated Circuits and Systems Design (SBCCI2003)}, date = {2003-09-08/2003-09-11}, doi = {10.1109/SBCCI.2003.1232851}, faupublication = {yes}, isbn = {9780769520094}, keywords = {Automotive engineering; Body area networks; Computer science; Delay; Energy consumption; Fault tolerance; Field programmable gate arrays; Hardware; Operating systems; Prototypes}, pages = {343-348}, peerreviewed = {unknown}, publisher = {Institute of Electrical and Electronics Engineers Inc.}, title = {{ReCoNet}: {Modeling} and implementation of fault tolerant distributed reconfigurable hardware}, venue = {São Paulo}, year = {2003} } @book{faucris.117102524, abstract = {Automotive, avionic or body-area networks are systems that consist of several communicating control units specialized for certain purposes. Typically, constraints regarding reliability, availability but also flexibility are imposed on these systems. In this chapter, we will present the ReCoNets approach for increasing reliability and flexibility of such systems by solving the hardware/software codesign problem online. A ReCoNet allows to migrate tasks implemented in hardware or software from one node to another. Typically, it consists of a network of communicating Field-Programmable Gate Arrays (FPGAs) and CPUs. Moreover, if a sufficient number of hardware/software resources is not available, the migration of functionality from hardware to software or vice versa is initiated by the system itself. For supporting such flexibility, new design methods as well as services integrated in a distributed operating system for networked embedded systems are revealed. Besides the formal definition of methods and concepts providing several self-x properties such as self-healing, self-adaptiveness and self-optimization, a ReCoNet demonstrator is presented hosting a driver assistance application. © 2010 Springer Science+Business Media B.V.}, author = {Haubelt, Christian and Koch, Dirk and Reimann, Felix and Streichert, Thilo and Teich, Jürgen}, doi = {10.1007/978-90-481-3485-4{\_}11}, faupublication = {yes}, isbn = {9789048134847}, pages = {223-243}, peerreviewed = {unknown}, publisher = {Springer Netherlands}, title = {{ReCoNets}-design methodology for embedded systems consisting of small networks of reconfigurable nodes and connections}, year = {2010} } @inproceedings{faucris.117258724, author = {Sousa, Éricles and Hannig, Frank and Teich, Jürgen}, booktitle = {Proceedings of the International Embedded Systems Symposium (IESS)}, date = {2015-11-03/2015-11-06}, doi = {10.1007/978-3-319-90023-0}, editor = {Marcelo Götz, Gunar Schirner, Marco Aurélio Wehrmeister, Mohammad Abdullah Al Faruque, and Achim Rettberg}, faupublication = {yes}, isbn = {978-3-319-90022-3}, pages = {1-10}, peerreviewed = {Yes}, publisher = {Springer International Publishing}, title = {{Reconfigurable} {Buffer} {Structures} for {Coarse}-{Grained} {Reconfigurable} {Arrays}}, url = {http://www.springer.com/us/book/9783319900223}, venue = {Foz do Iguaçu}, year = {2015} } @article{faucris.119973744, author = {Teich, Jürgen}, faupublication = {yes}, journal = {it - Information Technology}, note = {UnivIS-Import:2015-03-09:Pub.2007.tech.IMMD.inform.reconf}, pages = {139-142}, peerreviewed = {Yes}, title = {{Reconfigurable} {Computing} {Systems}}, volume = {49}, year = {2007} } @article{faucris.108157324, abstract = {

Not only in the field of high-performance computing, field-programmable gate arrays (FPGAs) are a soaringly popular accelerator technology. However, they use a completely different programming paradigm and tool set compared to CPUs or even GPUs, adding extra development steps and requiring special knowledge, hindering widespread use in scientific computing. To bridge this programmability gap, domain-specific languages are a popular choice to generate low-level implementations from an abstract algorithm description. In this work, we demonstrate our approach for the generation of numerical solver implementations based on the multigrid method for FPGAs from the same code base that is also used to generate code for CPUs using a hybrid parallelization of MPI and OpenMP. Our approach yields in a hardware design that can compute up to 11 V-cycles per second with an input grid size of 4096x4096 and solution on the coarsest using the conjugate gradient method on a mid-range FPGA, beating vectorized, multi-threaded execution on an Intel Xeon processor.

}, author = {Schmitt, Christian and Schmid, Moritz and Kuckuk, Sebastian and Köstler, Harald and Teich, Jürgen and Hannig, Frank}, doi = {10.1142/S0129626418500160}, faupublication = {yes}, journal = {Parallel Processing Letters}, peerreviewed = {Yes}, title = {{Reconfigurable} {Hardware} {Generation} of {Multigrid} {Solvers} with {Conjugate} {Gradient} {Coarse}-{Grid} {Solution}}, volume = {28}, year = {2018} } @inproceedings{faucris.118155004, address = {Berlin Heidelberg}, author = {Angermeier, Josef and Batzer, Ulrich and Claus, Christopher and Majer, Mateusz and Stechele, Walter and Teich, Jürgen}, booktitle = {Proceedings of the Fourth International Workshop on Applied Reconfigurable Computing}, faupublication = {yes}, note = {UnivIS-Import:2015-04-16:Pub.2008.tech.IMMD.inform.reconf}, pages = {149-159}, publisher = {Springer}, series = {Lecture Notes in Computer Science (LNCS)}, title = {{Reconfigurable} {HW}/{SW} {Architecture} of a {Reconfigurable} {HW}/{SW} {Architecture} of a {Real}-{Time} {Driver} {Assistance} {System}}, venue = {London}, year = {2008} } @inproceedings{faucris.116050264, author = {Bednara, Marcus and J. Shokrollahi, J. and Teich, Jürgen and Daldrup, M. and von zur Gathen, J.}, booktitle = {Proc. The 9th Reconfigurable Architectures Workshop}, faupublication = {no}, note = {UnivIS-Import:2015-04-16:Pub.2002.tech.IMMD.inform.reconf}, title = {{Reconfigurable} {Implementation} of {Elliptic} {Curve} {Crypto} {Algorithms}}, venue = {Fort Lauderdale, Florida}, year = {2002} } @book{faucris.122806464, abstract = {Placement and scheduling are recognized as the most important problems when exploiting the benefit of partially reconfigurable devices such as FPGAs. For example, dynamically loading and unloading modules onto an FPGA causes fragmentation, and-in turn-may decrease performance. To counteract this effect, we use methods from algorithmics and mathematical optimization to increase the performance and present algorithms for placing, scheduling, and defragmenting modules on FPGAs. Taking communication between modules into account, we further present strategies to minimize communication overhead. Finally, we consider scheduling module requests with time-varying resource demands. © 2010 Springer Science+Business Media B.V.}, author = {Ahmadinia, Ali and Angermeier, Josef and Fekete, Sandor P. and Kamphans, Tom and Koch, Dirk and Majer, Mateusz and Schweer, Nils and Teich, Jürgen and Tessars, Christopher and Van Der Veen, Jan C.}, doi = {10.1007/978-90-481-3485-4{\_}10}, faupublication = {yes}, isbn = {9789048134847}, pages = {199-221}, peerreviewed = {unknown}, publisher = {Springer Netherlands}, title = {{ReCoNodes}-optimization methods for module scheduling and placement on reconfigurable hardware devices}, year = {2010} } @inproceedings{faucris.122526404, author = {Aliee, Hananeh and Banaiyianmofrad, Abbas and Glaß, Michael and Teich, Jürgen and Dutt, Nikil}, booktitle = {Proc. Methoden und Beschreibungssprachen zur Modellierung und Verifikation von Schaltungen und Systemen (MBMV'17)}, date = {2017-02-08/2017-02-09}, faupublication = {yes}, isbn = {978-3-8440-4996-1}, pages = {1-12}, peerreviewed = {unknown}, title = {{Redundancy}-aware {Design} {Space} {Exploration} for {Memory} {Reliability} in {Many}-cores}, venue = {Bremen}, year = {2017} } @inproceedings{faucris.118054904, abstract = {Similar to programmable devices such as processors or micro controllers also reconfigurable logic devices can be built as software, by programming the configuration of the device. In this paper, we present an overview of constraints which have to be considered when mapping applications to coarse-grained reconfigurable architectures. The application areas of most of these architectures addressing computational-intensive algorithms like video and audio processing or wireless communication. Therefore, reconfigurable arrays are in direct competition with DSP processors which are traditionally used for digital signal processing. Hence, existing mapping methodologies are closely related to approaches from the DSP world. They try to employ pipelining and temporal partitioning but they do not exploit the full parallelism of a given algorithm and the computational potential of typically 2-dimensional arrays. We present a first case study for mapping regular algorithms onto reconfigurable arrays by using our design methodology which is characterized by loop parallelization in the polytope model. The case study shows that our regular mapping methodology may lead to highly efficient implementations taking the constraints of the architecture into account.}, author = {Hannig, Frank and Dutta, Hritam and Teich, Jürgen}, booktitle = {Proceedings of the 2004 IEEE International Conference on Acoustics, Speech, and Signal Processing}, date = {2004-05-17/2004-05-21}, faupublication = {yes}, isbn = {0-7803-8484-9}, note = {UnivIS-Import:2015-04-16:Pub.2004.tech.IMMD.inform.regula}, pages = {57-60}, title = {{Regular} {Mapping} for {Coarse}-grained {Reconfigurable} {Architectures}}, venue = {Montreal, Quebec}, volume = {V}, year = {2004} } @article{faucris.122848704, author = {Strehl, Karsten and Teich, Jürgen and Thiele, Lothar}, faupublication = {no}, journal = {Parallel Algorithms and Applications}, note = {UnivIS-Import:2015-03-09:Pub.2000.tech.IMMD.inform.regula}, pages = {265-300}, peerreviewed = {unknown}, title = {{Regular} {State} {Machines}}, year = {2000} } @inproceedings{faucris.122805804, author = {Teich, Jürgen and Thiele, Lothar}, booktitle = {Presented at Workshop Seminar No. 98341, Tiling for Optimal Resource Utilization}, faupublication = {no}, peerreviewed = {unknown}, title = {{Regular} {State} {Machines}}, venue = {Schloss Dagstuhl}, year = {1998} } @inproceedings{faucris.107173044, author = {Brand, Peter and Falk, Joachim and Ah Sue, Jonathan and Brendel, Johannes and Hasholzner, Ralph and Teich, Jürgen}, booktitle = {21st International Workshop on Software and Compilers for Embedded Systems (SCOPES’18)}, date = {2018-05-28/2018-05-30}, doi = {10.1145/3207719.3207722}, editor = {ACM}, faupublication = {yes}, isbn = {978-1-4503-5780-7}, keywords = {Dynamic Network Communication; LTE Radio Systems; Machine Learning; Predictability; Power Efficient Data Transmission}, pages = {18-26}, peerreviewed = {Yes}, title = {{Reinforcement} {Learning} for {Power}-{Efficient} {Grant} {Prediction} in {LTE}}, venue = {Sankt Goar}, year = {2018} } @inproceedings{faucris.122322464, abstract = {Increasing reliability is one of the most important design goals for current and future embedded systems. In this paper, we will put focus on the design phase in which reliability constitutes one of several competing design objectives. Existing approaches considered the simultaneous optimization of reliability with other objectives to be too extensive. Hence, they firstly design a system, secondly analyze the system for reliability and finally exchange critical parts or introduce redundancy in order to satisfy given reliability constraints or optimize reliability. Unfortunately, this may lead to suboptimal designs concerning other design objectives. Here, we will present a) a novel approach that considers reliability with all other design objectives simultaneously, b) an evaluation technique that is able to perform a quantitative analysis in reasonable time even for real-world applications, and c) experimental results showing the effectiveness of our approach. © 2007 EDAA.}, author = {Teich, Jürgen and et al.}, author_hint = {Glaß M., Lukasiewycz M., Streichelt T., Haubelt C., Teich J.}, booktitle = {Proceedings of Design, Automation and Test in Europe (DATE 2007)}, date = {2007-04-16/2007-04-20}, doi = {10.1109/DATE.2007.364626}, faupublication = {yes}, isbn = {9783981080124}, pages = {409-414}, peerreviewed = {unknown}, support_note = {Author relations incomplete. You may find additional data in field 'author{\_}hint'}, title = {{Reliability}-aware system synthesis}, venue = {Nice Acropolis}, year = {2007} } @inproceedings{faucris.106836444, abstract = {In this paper, we evaluate the suitability of different SRAM-based FPGAs for harsh radiation environments (e.g., space). In particular, we compare the space-grade and radiation-hardened by design Virtex-5QV (XQR5VFX130) with the commercial off-the-shelf Kintex-7 (KC7K325T) from Xilinx. The advantages of the latter device are: 2.5 times the resources of the space-grade FPGA, faster switching times, less power consumption, and the support of modern design tools. We focus on resource consumption as well as reliability in dependence of single event upset rates for a geostationary earth orbit satellite application, the Heinrich Hertz satellite mission. For this mission, we compare different modular redundancy schemes with different
voter structures for the qualification of a digital communication receiver. A major drawback of the Kintex-7 are current-step single event latchups, which are a risk for space missions. If the use of an external voter is not possible, we suggest triple modular redundancy with one single voter at the end, whereby the Virtex-5QV in this configuration is about as reliable as the Kintex-7 in an N-modular redundancy configuration with an external high-reliable voter.}, author = {Glein, Robert and Rittner, Florian and Becher, Andreas and Ziener, Daniel and Frickel, Jürgen and Teich, Jürgen and Heuberger, Albert}, booktitle = {Proceedings of 2015 NASA/ESA Conference on Adaptive Hardware and Systems}, date = {2015-06-15/2015-06-18}, editor = {IEEE}, faupublication = {yes}, keywords = {FPGA, radiation, space, TMR, NMR, Xilinx, Kintex, Virtex, single event upset, SEU, modular redundancy}, pages = {1 - 8}, title = {{Reliability} of {Space}-{Grade} vs. {COTS} {SRAM}-{Based} {FPGA} in {N}-{Modular} {Redundancy}}, venue = {Montreal}, year = {2015} } @inproceedings{faucris.123950464, author = {Becher, Andreas and Wildermann, Stefan and Mühlenthaler, Moritz and Teich, Jürgen}, booktitle = {Proceedings of the International Conference on Reconfigurable Computing and FPGAs}, doi = {10.1109/ReConFig.2016.7857185}, faupublication = {yes}, pages = {1-8}, peerreviewed = {unknown}, title = {{ReOrder}: {Runtime} {Datapath} {Generation} for {High}-{Throughput} {Multi}-{Stream} {Processing}}, venue = {Cancún}, year = {2016} } @inproceedings{faucris.122181444, abstract = {Many embedded systems are implemented with a set of alternative function variants to adapt the system to different applications or environments. This paper proposes a novel approach for the coherent representation and selection of function variants in the different phases of the design process. In this context, the modeling of reconfiguration of system parts is supported in a natural way. Using a real example from the video processing domain, the approach is explained and validated.}, author = {Ernst, Rolf and Richter, Kai and Teich, Jürgen and Thiele, Lothar and Ziegenbein, Dirk}, booktitle = {Proc. 36th Design Automation Conference}, date = {1999-06-21/1999-06-25}, faupublication = {no}, note = {UnivIS-Import:2015-04-16:Pub.1999.tech.IMMD.inform.repres}, pages = {517-522}, publisher = {IEEE}, title = {{Representation} of {Function} {Variants} for {Embedded} {System} {Optimization} and {Synthesis}}, venue = {New Orleans}, year = {1999} } @inproceedings{faucris.117972184, abstract = {The specification of embedded systems very often contains a mixture of different models of computation. In particular, the data flow and control flow associated to the transformative and reactive domains, respectively, are tightly coupled. The paper considers classes of applications that feature communicating processes whose functions depend on a finite set of computation modes. The change between these modes is synchronized by data communication. An approach is presented to model the correlation of process modes and to fully utilize this information for scheduling. A modeling example shows the optimization potential of the new approach.}, author = {Ernst, Rolf and Richter, Kai and Teich, Jürgen and Thiele, Lothar and Ziegenbein, Dirk}, booktitle = {Proc. of ICCAD - the ACM/IEEE Int. Conf. on CAD}, date = {1998-11-08/1998-11-12}, faupublication = {no}, note = {UnivIS-Import:2015-04-16:Pub.1998.tech.IMMD.inform.repres}, pages = {54-61}, publisher = {IEEE Computer Society}, title = {{Representation} of {Process} {Mode} {Correlation} for {Scheduling}}, venue = {San Jose, CA}, year = {1998} } @inproceedings{faucris.109553004, abstract = {Choosing the right programming model for multiprocessor System-on-Chip (MPSoC) platforms is a challenging task: in order to provide for automatic hardware/software synthesis in a model-based design flow, the system model should be architecture-independent on the one hand, but should also allow the back-annotation of architecture-dependent mapping and scheduling decisions on the other hand. Here, architecture-independent dataflow models are particularly suitable. In a dataflow model, concurrent processes (actors) communicate via packets transmitted over channels. However, the back-annotation process is difficult, and previous approaches are either restricted to static dataflow actors or static schedules. Also, the semantics of the underlying dataflow actors are often different from the semantics of the scheduling mechanism, which limits the compositionality of the dataflow model. In this paper, we propose a modeling approach which unifies the representation of dataflow actors and scheduling mechanisms, while also providing for both, dynamic dataflow actors and dynamic scheduling decisions. We describe how various scheduling schemes can be represented by our approach, and show the applicability of the proposed approach by means of synthetic dataflow graphs. © 2013 European Electronic Chips & Systems design Initiative - ECSI.}, author = {Zebelein, Christian and Haubelt, Christian and Falk, Joachim and Schwarzer, Tobias and Teich, Jürgen}, booktitle = {Proceedings of Forum on Specification & Design Languages (FDL 2013)}, date = {2013-09-24/2013-09-26}, faupublication = {yes}, isbn = {9782953050486}, pages = {8}, peerreviewed = {unknown}, title = {{Representing} mapping and scheduling decisions within dataflow graphs}, url = {https://www.scopus.com/inward/record.url?partnerID=HzOxMe3b&scp=84891286674&origin=inward}, venue = {Paris}, year = {2013} } @inproceedings{faucris.121427284, author = {Falk, Joachim and Haubelt, Christian and Teich, Jürgen}, booktitle = {GI/ITG/GMM-Workshop 2005}, date = {2005-04-06/2005-04-07}, faupublication = {yes}, peerreviewed = {unknown}, title = {{Representing} {Models} of {Computation} in {SystemC}}, venue = {Munich}, year = {2005} } @inproceedings{faucris.209014095, abstract = {Reconfigurable hardware such as Field-programmable Gate Arrays (FPGAs) is widely used for data processing in databases. Most of the related work focuses on accelerating one or a small set of specific operations like sort, join, regular expression matching. A drawback of such approaches is often the assumed static accelerator hardware architecture: Rather than adapting the hardware to fit the query, the query plan has to be adapted to fit the hardware. Moreover, operators or data types that are not supported by the accelerator have to be processed in software. As a remedy, approaches for exploiting the dynamic partial reconfigurability of FPGAs have been proposed that are able to adapt the datapath at runtime. However, on modern FPGAs, this introduces new challenges due to the heterogeneity of the available resources. In addition, not only the execution resources may be heterogeneous but also the memory resources.

This work focuses on the architectural aspects of database (co-)processing on heterogeneous FPGA-based PSoC (programmable System-on-Chip) architectures including processors, specialized hardware components, multiple memory types and dynamically partially reconfigurable areas. We present an approach to support such (co-)processing called ReProVide. In particular, we introduce a model to formalize the challenging task of operator placement and buffer allocation onto such heterogeneous hardware and describe the difficulties of finding good placements. Furthermore, a detailed insight into different memory types and their peculiarities is given in order to use the strength of heterogeneous memory architectures. Here, we also highlight the implications of heterogeneous memories for the problem of query placemen}, address = {Bonn}, author = {Becher, Andreas and Herrmann, Achim and Wildermann, Stefan and Teich, Jürgen}, booktitle = {Proceedings of the 1st Workshop on Novel Data Management Ideas on Heterogeneous (Co-)Processors (NoDMC)}, date = {2019-03-04/2019-03-08}, doi = {10.18420/btw2019-ws-04}, editor = {Gesellschaft für Informatik, Bonn}, faupublication = {yes}, keywords = {FPGA;Shared Memory;Query Acceleration;Near-Memory Processing}, pages = {51-70}, peerreviewed = {Yes}, publisher = {Gesellschaft für Informatik}, title = {{ReProVide}: {Towards} {Utilizing} {Heterogeneous} {Partially} {Reconfigurable} {Architectures} for {Near}-{Memory} {Data} {Processing}}, url = {https://dl.gi.de/handle/20.500.12116/21825}, venue = {Universität Rostock}, year = {2019} } @article{faucris.122956284, author = {Herkersdorf, Andreas and Aliee, Hananeh and Engel, Michael and Glaß, Michael and Gimmler-Dumont, Christina and Henkel, Jörg and Kleeberger, Veit B. and Kochte, Michael A. and Kühn, Johannes M. and Mueller-Gritschneder, Daniel and Nassif, Sani R. and Rauchfuss, Holm and Rosenstiel, Wolfgang and Schlichtmann, Ulf and Shafique, Muhammad and Tahoori, Mehdi B. and Teich, Jürgen and Wehn, Norbert and Weis, Christian and Wunderlich, Hans-Joachim}, doi = {10.1016/j.microrel.2013.12.012}, faupublication = {yes}, journal = {Microelectronics Reliability}, pages = {1066-1074}, peerreviewed = {Yes}, title = {{Resilience} {Articulation} {Point} ({RAP}): {Cross}-layer {Dependability} {Modeling} for {Nanometer} {System}-on-{Chip} {Resilience}}, volume = {54}, year = {2014} } @inproceedings{faucris.119659364, author = {Sousa, Éricles and Paul, Johny and Lari, Vahid and Hannig, Frank and Teich, Jürgen and Stechele, Walter}, booktitle = {Hardware and Software Demo at the University Booth at Design, Automation and Test in Europe (DATE)}, date = {2014-03-24/2017-12-28}, faupublication = {yes}, peerreviewed = {Yes}, title = {{Resource}-{Aware} {Computer} {Vision} {Application} on {Heterogeneous} {Multi}-{Tile} {Architecture}.}, url = {https://www.date-conference.com/system/files/file/date14/ubooth/2615.pdf}, venue = {Dresden}, year = {2014} } @article{faucris.109553444, abstract = {Multiprocessor system-on-chip (MPSoC) designs offer a lot of computational power assembled in a compact design. The computing power of MPSoCs can be further augmented by adding massively parallel processor arrays (MPPA) and specialized hardware with instruction-set extensions. On-chip MPPAs can be used to accelerate low-level image-processing algorithms with massive inherent parallelism. However, the presence of multiple processing elements (PEs) with different characteristics raises issues related to programming and application mapping, among others. The conventional approach used for programming heterogeneous MPSoCs results in a static mapping of various parts of the application to different PE types, based on the nature of the algorithm and the structure of the PEs. Yet, such a mapping scheme independent of the instantaneous load on the PEs may lead to under-utilization of some type of PEs while overloading others. In this work, we investigate the benefits of using a heterogeneous MPSoC for accelerating various stages within a real-world image-processing algorithm for object-recognition. A case study demonstrates that a resource-aware programming model called Invasive Computing helps to improve the throughput and worst observed latency of the application program, by dynamically mapping applications to different types of PEs available on a heterogeneous MPSo}, author = {Paul, Johny and Stechele, Walter and Oechslein, Benjamin and Erhardt, Christoph and Schedel, Jens and Lohmann, Daniel and Schröder-Preikschat, Wolfgang and Kröhnert, Manfred and Asfour, Tamim and Sousa, Éricles and Hannig, Frank and Lari, Vahid and Teich, Jürgen and Grudnitsky, Artjom and Bauer, Lars and Henkel, Jörg}, doi = {10.1016/j.sysarc.2015.09.002}, faupublication = {yes}, journal = {Journal of Systems Architecture}, keywords = {Computer vision; Heterogeneous processor; Image processing; Invasive Computing; MPSoC; Resource awareness}, pages = {668-680}, peerreviewed = {Yes}, title = {{Resource}-awareness on heterogeneous {MPSoCs} for image processing}, volume = {61}, year = {2015} } @inproceedings{faucris.118240804, abstract = {The efficient use of future MPSoCs with f 000 or more processor cores requires new means of resource-aware programming to deal with increasing imperfections such as process variation, fault rates, aging effects, and power as well as thermal problems. In this paper, we apply a new approach called invasive computing that enables an application programmer to spread computations to processors deliberately and on purpose at certain points of the program. Such decisions can be made depending on the degree of application parallelism and the state of the underlying resources such as utilization, load, and temperature. The introduced programming constructs for resource-aware programming are embedded into the parallel computing language X10 as developed by IBM using a library-based approach. Moreover, we show how individual heterogeneous MPSoC architectures may be modeled for subsequent functional simulation by defining compute resources such as processors themselves by lightweight threads that are executed in parallel together with the application threads by the X10 run-time system. Thus, the state changes of each hardware resource may be simulated including temperature, aging, and other useful monitor functionality to provide a first high-level programming test-bed for invasive computing. Copyright © 2011 ACM.}, address = {New York, NY, USA}, author = {Hannig, Frank and Roloff, Sascha and Snelting, Gregor and Teich, Jürgen and Zwinkau, Andreas}, booktitle = {Proceedings of the 14th International Workshop on Software and Compilers for Embedded Systems}, date = {2011-06-27/2011-06-28}, doi = {10.1145/1988932.1988941}, faupublication = {yes}, isbn = {978-1-4503-0763-5}, keywords = {MPSoC; Resource-aware programming; Simulation; X10}, note = {UnivIS-Import:2015-04-16:Pub.2011.tech.IMMD.inform.resour}, pages = {48-55}, publisher = {ACM}, series = {SCOPES'11}, title = {{Resource}-{Aware} {Programming} and {Simulation} of {MPSoC} {Architectures} through {Extension} of {X10}}, venue = {St. Goar}, year = {2011} } @inproceedings{faucris.116477724, abstract = {In this paper we present a significant extension of the quantified equation based algorithm class of piecewise regular algorithms. The main contributions of the following paper are: (1) the class of piecewise regular algorithms is extended by allowing run-time dependent conditionals, (2) a mixed integer linear program is given to derive optimal schedules of the novel class we call dynamic piecewise regular algorithms, and (3) in order to achieve highest performance, we present a speculative scheduling approach. The results are applied to an illustrative example.}, author = {Hannig, Frank and Teich, Jürgen}, booktitle = {Proceedings of the 15th IEEE International Conference on Application-specific Systems, Architectures, and Processors (ASAP 2004)}, date = {2004-09-27/2004-09-29}, faupublication = {yes}, isbn = {0-7695-2226-2}, note = {UnivIS-Import:2015-04-16:Pub.2004.tech.IMMD.inform.resour}, pages = {17 - 27}, publisher = {Institute of Electrical and Electronics Engineers}, title = {{Resource} {Constrained} and {Speculative} {Scheduling} of an {Algorithm} {Class} with {Run}-{Time} {Dependent} {Conditionals}}, venue = {Galveston, TX}, year = {2004} } @misc{faucris.110064724, author = {Hannig, Frank and Teich, Jürgen}, faupublication = {yes}, peerreviewed = {automatic}, title = {{Resource} {Constrained} and {Speculative} {Scheduling} of {Dynamic} {Piecewise} {Regular} {Algorithms}}, year = {2004} } @inproceedings{faucris.301216552, abstract = {Binary neural networks (BNNs) are a highly resource-efficient variant of neural networks. The efficiency of BNNs for tiny machine learning (TinyML) systems can be enhanced by structured pruning and making BNNs robust to faults. When used with approximate memory systems, this fault tolerance can be traded off for energy consumption, latency, or cost. For pruning, magnitude-based heuristics are not useful because the weights in a BNN can either be -1 or +1. Global pruning of BNNs has not been studied well so far. Thus, in this paper, we explore gradient-based ranking criteria for pruning BNNs and use them in combination with a sensitivity analysis. For robustness, the state-of-the-art is to train the BNNs with bit-flips in what is known as fault-aware training. We propose a method to guide fault-aware training using gradient-based explainability methods. This allows us to obtain robust and efficient BNNs for deployment on tiny devices. Experiments on audio and image processing applications show that our proposed approach outperforms the existing approaches, making it useful for obtaining efficient and robust models for a slight degradation in accuracy. This makes our approach valuable for many TinyML use cases.Here, jitter in non-functional program execution qualities is caused either by outer influences such as faults injected by the environment,
but can be induced also from the system management software itself, including thread-to-core mapping, scheduling and power management.
A second huge source of variability typically stems from data-dependent workloads.
In this paper, we classify and present techniques to enforce non-functional execution properties on multi-core programs.
Based on a static design space exploration and analysis of influences of variability of non-functional properties, enforcement strategies are generated to guide
the execution of periodically executed applications in given requirement corridors.
Using the case study of a complex image streaming application, we show that by controlling DVFS settings of cores proactively, not only tight execution times, but also reliability requirements may be enforced dynamically while trying to minimize energy consumption.
}, author = {Teich, Jürgen and Pourmohseni, Behnaz and Keszöcze, Oliver and Spieck, Jan and Wildermann, Stefan}, booktitle = {Asia and South Pacific Design Automation Conference (ASP-DAC)}, date = {2020-01-13/2020-01-16}, doi = {10.1109/ASP-DAC47756.2020.9045536}, faupublication = {yes}, keywords = {run-time enforcement, many-core systems, reliability, realtime}, month = {Jan}, pages = {629--636}, peerreviewed = {unknown}, title = {{Run}-{Time} {Enforcement} of {Non}-{Functional} {Application} {Requirements} in {Heterogeneous} {Many}-{Core} {Systems}}, venue = {China National Convention Center, Beijing, China}, year = {2020} } @incollection{faucris.227350948, author = {Teich, Jürgen and Mahmoody, Pouya and Pourmohseni, Behnaz and Roloff, Sascha and Schröder-Preikschat, Wolfgang and Wildermann, Stefan}, booktitle = {A Journey of Embedded and Cyber-Physical Systems}, doi = {10.1007/978-3-030-47487-4}, editor = {Jian-Jia Chen}, faupublication = {yes}, isbn = {978-3-030-47487-4}, peerreviewed = {unknown}, publisher = {Springer}, title = {{Run}-{Time} {Enforcement} of {Non}-functional {Program} {Properties} on {MPSoCs}}, year = {2020} } @inproceedings{faucris.118306804, author = {Wildermann, Stefan and Ziermann, Tobias and Teich, Jürgen}, booktitle = {Proc. 2009 International Conference on Field-Programmable Technology}, faupublication = {yes}, note = {UnivIS-Import:2015-04-16:Pub.2009.tech.IMMD.inform.runtim}, pages = {514-517}, title = {{Run} time {Mapping} of {AdaptiveApplications} onto {Homogeneous} {NoC}-based {Reconfigurable} {Architectures}}, venue = {Sydney}, year = {2009} } @misc{faucris.108156004, author = {Teich, Jürgen}, faupublication = {yes}, peerreviewed = {automatic}, title = {{Run}-{Time} {Monitoring} and {Enforcement} of {Non}-functional {Program} {Properties} of {Invasive} {Programs}: {Terms} and {Definitions}}, url = {https://www12.informatik.uni-erlangen.de/publications/pub2017/reportTeich17.pdf}, year = {2017} } @inproceedings{faucris.122814164, abstract = {This paper describes a runtime reconfigurable bus arbitration technique for concurrent applications on heterogeneous MPSoC architectures. Here, a hardware/software approach is introduced as part of a runtime framework that enables selecting and adapting different policies (i. e., fixed-priority, TDMA, and Round-Robin) such that the performance goals of concurrent applications can be satisfied. To evaluate the hardware cost, we compare our proposed solution with respect to a well-known SPARC V8 architecture supporting fixed-priority arbitration. Notably, even providing the flexibility for selecting up to three different policies, our reconfigurable arbiter needs only 25% and 7% more LUTs and slices registers, respectively. The reconfiguration overhead for changing between different policies is 56 cycles and for programming new time slots, only 28 cycles are necessary. For demonstrating the benefits of this reconfiguration framework, we setup a mixed hard/soft real-time scenario by considering four applications with different timeliness requirements. The experimental results show that by reconfiguring the arbiter, less processing elements can be used for achieving a specific target frame rate. Moreover, adjusting the time slots for TDMA, we can speedup a soft real-time algorithm while still satisfying the deadline for hard real-time applications.}, author = {Sousa, Éricles and Gangadharan, Deepak and Hannig, Frank and Teich, Jürgen}, booktitle = {Proceedings of the EUROMICRO Digital System Design Conference (DSD)}, date = {2014-08-27/2014-08-29}, doi = {10.1109/DSD.2014.105}, faupublication = {yes}, isbn = {9781479957934}, keywords = {Concurrent Applications; Heterogeneous MPSoC Architectures; Reconfigurable Bus Arbitration; Runtime}, pages = {74-81}, peerreviewed = {unknown}, publisher = {Institute of Electrical and Electronics Engineers Inc.}, title = {{Runtime} reconfigurable bus arbitration for concurrent applications on heterogeneous {MPSoC} architectures}, venue = {Verona}, year = {2014} } @inproceedings{faucris.203611213, abstract = {Loop bounds are often unknown until run time, making it difficult to analyze non-functional properties such as latency at compile-time.
Similarly, static allocations of processing resources to loop computations might be too conservative with respect to given performance requirements, or not optimal with respect to the energy consumption.

To still satisfy requirements when accelerating loop nests under this uncertainty of loop bounds, we formalize and propose an approach to run-time requirement enforcement:
at run time, select a mapping among a set of candidates that satisfies a given set of requirements while optimizing secondary objectives.
Because the candidate search space of suitable mappings might be prohibitively large to evaluate at run time, we further introduce two approaches to reduce its cardinality:
1) architecture-specific reduction by solving for parts of the mapping from the requirements, and
2) design-time reduction by finding a k-subset of mappings that maximizes the number of loop bounds where the requirements are satisfied.

We implemented our proposed run-time requirement enforcement techniques for a representative class of programmable processor array architecture called Tightly Coupled Procesor Array and demonstrate their effectiveness with a case study.
The case study shows the effectiveness of our approach: We can satisfy given latency requirements while easily saving up to 10% in energy.

and execution behavior cannot be optimally executed by a single

mapping of application tasks to a heterogeneous multi-core target

architecture. Albeit mapping a task to a resource with high

computational power may be suitable for input triggering a

high workload of this task, it may be more efficient to map

another task to the resource in case of input providing low

workload for the former and high workload for the latter. As

a remedy, we propose to group inputs evoking similar workload

and execution characteristics into so-called workload scenarios for

which specialized mappings targeted at the common workload

distribution in the scenario are provided. Optimized mappings

for each scenario can be determined by a scenario-based design

space exploration at design time.

At run time, applications process a stream of input data whose

scenario affiliation is a priori unknown. This entails two coupled

tasks: First, we have to identify the scenario of the current input

data based on its execution characteristics. Second, we have to

choose an application mapping for processing the current input

prior to its execution on the basis of the concluded scenarios

of the past input and the currently active scenario-associated

mapping. Note that switching between scenarios may come at

a non-negligible reconfiguration cost that could decrease the

advantage gained by a more suitable mapping.

Both tasks are tackled by a proposed run-time reconfiguration

manager, which is built on machine learning models. These

models learn a strategy for identifying scenarios and selecting

adequate mappings by design-time training. Here, different

machine learning models are compared for their applicability.

An evaluation of the run-time manager based on a ray tracing

and stitching application shows significant latency improvements

compared to an approach with a single mapping optimized for

the average-case input.

}, author = {Spieck, Jan and Wildermann, Stefan and Teich, Jürgen}, booktitle = {Post-workshop proceedings of 2019 ACM/IEEE 1st Workshop on Machine Learning for CAD}, date = {2019-09-03/2019-09-04}, doi = {10.1109/MLCAD48534.2019.9142060}, faupublication = {yes}, keywords = {scenario-based dse; heterogeneous architecture; data-driven mapping; run-time manager; machine learning model}, peerreviewed = {Yes}, title = {{Run}-{Time} {Scenario}-{Based} {MPSoC} {Mapping} {Reconfiguration} {Using} {Machine} {Learning} {Models}}, venue = {Canmore, Alberta, Canada}, year = {2020} } @inproceedings{faucris.118492704, abstract = {Ever shrinking device structures result in an increased susceptibility of modern embedded systems to radiation and temperature-dependent aging effects. This work introduces a runtime placement algorithm for dynamically reconfigurable systems that have to meet varying safety requirements. The algorithm first allocates replicas of modules to cope with soft-errors and meet the safety-level of the module and then places the modules onto the FPGA in such a way that the stress, and therefore aging, is minimized. For the replica allocation, a lifetime analysis is employed to predict the reliability of a module depending on its sensitive configuration bits and the expected runtime of the module. Moreover, the temperature profile of each active module is utilized to predict the degradation of each part of the reconfigurable area. The presented algorithm then equally distributes active modules to minimize the degradation effects while respecting placement constraints that arise from the need for majority voting between the different replicas of a module. A case study gives evidence of the capability of the proposed online placing algorithm to harden a system against radiation effects and meet safety constraints while extending the overall lifetime of the reconfigurable device by minimizing stress. © 2011 IEEE.}, address = {New York, NY, USA}, author = {Angermeier, Josef and Ziener, Daniel and Glaß, Michael and Teich, Jürgen}, booktitle = {Proceedings of the International Conference on Field-Programmable Technology}, date = {2011-12-12/2011-12-14}, doi = {10.1109/FPT.2011.6133247}, faupublication = {yes}, isbn = {978-1-4577-1741-3}, note = {UnivIS-Import:2015-04-16:Pub.2011.tech.IMMD.inform.runtim}, publisher = {IEEE Press}, title = {{Runtime} {Stress}-{Aware} {Replica} {Placement} on {Reconfigurable} {Devices} under {Safety} {Constraints}}, venue = {New Delhi}, year = {2011} } @inproceedings{faucris.109555204, abstract = {SAT-based verification of electronic systems has become very popular in recent years. In this paper, we show that SAT-techniques are also applicable and helpful during the synthesis and the optimization of a system. Therefore, we must consider two questions: (i) how to represent specifications; and (ii) how to quantify properties of embedded systems by boolean formulas. Thus, we reduce the well known binding problem to the Boolean satisfiability problem. Next, we show how to quantify the degree of fault tolerance of a system using quantified Boolean formulas (QBFs). These problems correspond to typical subroutines often used during design space exploration. We show by experiment that problem instances of reasonable size are easily solved by the QBF solver QSOLVE. © 2003 IEEE.}, author = {Haubelt, Christian and Teich, Jürgen and Feldmann, Rainer and Monien, Burkhard}, booktitle = {Proceedings of Design, Automation and Test in Europe (DATE 2003)}, date = {2003-03-03/2003-03-07}, doi = {10.1109/DATE.2003.1253784}, faupublication = {yes}, pages = {1168-1169}, peerreviewed = {unknown}, title = {{SAT}-based techniques in system synthesis}, venue = {Munich}, year = {2003} } @inproceedings{faucris.117107804, abstract = {For complex optimization problems, several population-based heuristics like Multi-Objective Evolutionary Algorithms have been developed. These algorithms are aiming to deliver sufficiently good solutions in an acceptable time. However, for discrete problems that are restricted by several constraints it is mostly a hard problem to even And a single feasible solution. In these cases, the optimization heuristics typically perform poorly as they mainly focus on searching feasible solutions rather than optimizing the objectives. In this paper, we propose a novel methodology to obtain feasible solutions from constrained discrete problems in population-based optimization heuristics. At this juncture, the constraints have to be converted into the Propositional Satisfiability Problem (SAT). Obtaining a feasible solution is done by the DPLL algorithm which is the core of most modern SAT solvers. It is shown in detail how this methodology is implemented in Multi-objective Evolutionary Algorithms. The SAT solver is used to obtain feasible solutions from the genetic encoded information on arbitrarily hard solvable problems where common methods like penalty functions or repair strategies are failing. Handmade test cases are used to compare various configurations of the SAT solver. On an industrial example, the proposed methodology is compared to common strategies which are used to obtain feasible solutions. © 2007 IEEE.}, author = {Lukasiewycz, Martin and Glaß, Michael and Haubelt, Christian and Teich, Jürgen}, booktitle = {In Proceedings of the 2007 IEEE Congress on Evolutionary Computation (CEC 2007)}, date = {2007-09-25/2007-09-28}, doi = {10.1109/CEC.2007.4424570}, faupublication = {yes}, isbn = {9781424413409}, pages = {935-942}, peerreviewed = {unknown}, title = {{SAT}-decoding in evolutionary algorithms for discrete constrained optimization problems}, venue = {Singapore}, year = {2007} } @article{faucris.115228344, abstract = {This letter presents a systematic approach to efficiently handle a very large number of power domains in modern coarse-grained reconfigurable arrays in order to tightly match the different computational demands of processed algorithms with corresponding power consumption. It is based on a new highly scalable and generic power control network and additionally uses the state-of-the-art common power format based front-to-backend design methodology for a fully automated implementation. The power management is transparent to the user and is seamlessly integrated into the overall reconfiguration process: reconfiguration-controlled power gating. Furthermore, for the first time, a coarse-grained reconfigurable case study design with as many as 24 switchable power domains with detailed results on power savings and overheads is presented. The application of the proposed technique results in 60% active leakage and 90% standby leakage power reduction for several digital signal processing algorithms. © 2009 IEEE.}, author = {Kissler, Dmitrij and Gran, Daniel and Salcic, Zoran and Hannig, Frank and Teich, Jürgen}, doi = {10.1109/LES.2011.2124438}, faupublication = {yes}, journal = {IEEE Embedded Systems Letters}, keywords = {Coarse-grained reconfigurable arrays; common power format; power gating}, note = {UnivIS-Import:2015-04-14:Pub.2011.tech.IMMD.inform.scalab}, pages = {58-61}, peerreviewed = {unknown}, title = {{Scalable} {Many}-{Domain} {Power} {Gating} in {Coarse}-grained {Reconfigurable} {Processor} {Arrays}}, volume = {3}, year = {2011} } @inproceedings{faucris.118709184, abstract = {One of the most important figures of merit concerning mobile devices is their energy efficiency. However, the modeling of such systems is mainly focused on the functional aspects, but not on the power consumption. Therefore, power estimations can only be accomplished late in the design process, after all architectural decisions have been taken. This might lead to designs that are not optimal with respect to power consumption. To overcome these limitations, this article proposes an efficient and flexible power modeling methodology at system level for heterogeneous integrated systems, allowing to combine state-dependent and environmental dependent power models. In a case study, the energy consumption of an RF transceiver of a mobile phone is estimated in a real-life scenario, including the baseband module, the radio channel, and the base transceiver station in a simulative way. © 2013 European Microwave Association.}, author = {Glock, Stefan and Rosales, Rafael and Reutelhuber, Franz and Glaß, Michael and Teich, Jürgen and Fischer, Georg and Weigel, Robert and Ußmüller, Thomas}, booktitle = {Proc. 43rd European Microwave Conference}, date = {2013-10-07/2013-10-10}, faupublication = {yes}, note = {elib2cris::1512001600,glock2013b}, pages = {342-345}, peerreviewed = {Yes}, title = {{Scenario}-{Based} {Energy} {Estimation} of {Heterogeneous} {Integrated} {Systems} at {System} {Level}}, venue = {Nuremberg}, year = {2013} } @inproceedings{faucris.236055517, author = {Spieck, Jan and Wildermann, Stefan and Teich, Jürgen}, booktitle = {Proceedings of the 57th Annual Design Automation Conference (DAC)}, date = {2020-07-19/2020-07-23}, doi = {10.1109/DAC18072.2020.9218537}, faupublication = {yes}, peerreviewed = {No}, title = {{Scenario}-{Based} {Soft} {Real}-{Time} {Hybrid} {Application} {Mapping} for {MPSoCs}}, venue = {San Francisco}, year = {2020} } @misc{faucris.123674364, author = {Naedele, Martin and Strehl, Karsten and Teich, Jürgen and Thiele, Lothar and Ziegenbein, Dirk}, faupublication = {no}, month = {Jan}, peerreviewed = {automatic}, title = {{SCF} - {State} {Machine} {Controlled} {Flow} {Diagrams}}, year = {1998} } @inproceedings{faucris.117912124, address = {Berlin}, author = {Angermeier, Josef and Fekete, Sandor P. and Göhringer, Diana and Majer, Mateusz and Teich, Jürgen and Van Der Veen, Jan C.}, booktitle = {Proc. of the 20th International Conference on Architecture of Computing Systems}, faupublication = {yes}, note = {UnivIS-Import:2015-04-16:Pub.2007.tech.IMMD.inform.schedu}, pages = {151-160}, publisher = {VDE-Verlag}, title = {{Scheduling} and communication-aware mapping of {HW}-{SW} modules for dynamically and partially reconfigurable {SoC} architectures}, venue = {Zurich}, year = {2007} } @inproceedings{faucris.115651624, abstract = {In this paper, a scheduling method for heterogeneous embedded systems is developed. At first, an internal representation model called FunState is presented which enables the explicit representation of non-determinism and scheduling using a combination of functions and state machines. The new scheduling method is able to deal with mixed data/control flow specifications and takes into account different mechanisms of non-determinism as occurring in the design of embedded systems. Constraints imposed by other already implemented components are respected. The scheduling approach avoids the explicit enumeration of execution paths by using symbolic techniques and guarantees to find a deadlock-free and bounded schedule if one exists. The generated schedule consists of statically scheduled basic blocks which are dynamically called at run time.}, author = {Ernst, Rolf and Strehl, Karsten and Teich, Jürgen and Thiele, Lothar and Ziegenbein, Dirk}, booktitle = {Proc. CODES'99, the 7th Int. Workshop on Hardware/Software Co-Design}, date = {1999-05-03/1999-05-05}, faupublication = {no}, note = {UnivIS-Import:2015-04-16:Pub.1999.tech.IMMD.inform.schedu}, pages = {173-177}, publisher = {IEEE}, title = {{Scheduling} {Hardware}/{Software} {Systems} {Using} {Symbolic} {Techniques}}, venue = {Rome}, year = {1999} } @inproceedings{faucris.119898284, abstract = {A single integer linear programming model for optimally scheduling partitioned regular algorithms is presented. The herein presented methodology differs from existing methods in the following capabilities: 1) Not only constraints on the number of available processors and communication capabilities are taken into account, but also processor caches and constraints on the size of available memories are modeled and taken into account in the optimization model. 2) Different types of processors can be handled. 3) The size of the optimization model (number of integer variables) is independent of the size of the tiles to be executed. Hence, 4) the number of integer variables in the optimization model is greatly reduced such that problems of relevant size can be solved in practical execution time.}, author = {Teich, Jürgen and Thiele, Lothar and Zhang, L}, booktitle = {Proc. Int. Conf. on Application-Specific Systems, Architectures, and Processors (ASAP´96)Int. Conf. on Application-Specific Systems, Architectures, and Processors (ASAP´96)Int. Conf. on Application-Specific Systems, Architectures, and Processors}, date = {1996-08-19/1996-08-21}, faupublication = {no}, note = {UnivIS-Import:2015-04-16:Pub.1996.tech.IMMD.inform.schedu}, pages = {131-144}, publisher = {IEEE}, title = {{Scheduling} of partitioned regular algorithms on processor arrays with constrained resources}, venue = {Chicago, U.S.A.,}, year = {1996} } @article{faucris.115187864, author = {Kissler, Dmitrij and Hannig, Frank and Teich, Jürgen}, faupublication = {yes}, journal = {Design & Elektronik}, note = {UnivIS-Import:2015-04-14:Pub.2007.tech.IMMD.inform.schwac}, pages = {34-39}, peerreviewed = {unknown}, title = {{Schwach}-programmiert macht stark}, year = {2007} } @inproceedings{faucris.117739204, address = {USA}, author = {Koch, Dirk and Teich, Jürgen and Körber, Mathias}, booktitle = {Proceedings of International Conference on Engineering of Reconfigurable Systems and Algorithms}, date = {2006-06-26/2006-06-29}, faupublication = {yes}, isbn = {1-60132-011-6}, note = {UnivIS-Import:2015-04-16:Pub.2006.tech.IMMD.inform.search}, pages = {42-48}, publisher = {CSREA Press}, title = {{Searching} {RC5}-{Keys} with {Distributed} {Reconfigurable} {Computing}}, venue = {Las Vegas}, year = {2006} } @inproceedings{faucris.232448347, abstract = {In modern embedded systems, the trust in comprehensive security standards all along the product life cycle has become an increasingly important access-to-market requirement. However, these security standards rely on mandatory immunity assumptions such as the integrity and authenticity of an initial system configuration typically loaded from Non-Volatile Memory (NVM). This applies especially to FPGA-based programmable system-on-chip (PSoC) architectures, since object codes as well as configuration data easily exceed the capacity of a secure boot ROM. In this context, an attacker could try to alter the content of the NVM device in order to manipulate the system. The PSoC therefore relies on the integrity of the NVM particularly at boot-time. In this paper, we propose a methodology for securely booting from an NVM in a potentially unsecure environment by exploiting the reconfigurable logic of the FPGA. Here, the FPGA serves as a secure anchor point by performing required integrity and authenticity verifications prior to the configuration and execution of any user application loaded from the NVM on the PSoC. The proposed secure boot process is based on the following assumptions and steps: 1) The boot configuration is stored on a fully encrypted Secure Digital memory card (SD card) or alternatively Flash acting as NVM. 2) At boot time, a hardware design called Trusted Memory-Interface Unit (TMIU) is loaded to verify first the authenticity of the deployed NVM and then after decryption the integrity of its content. To demonstrate the practicability of our approach, we integrated the methodology into the vendor-specific secure boot process of a Xilinx Zynq PSoC and evaluated the design objectives performance, power and resource cost}, author = {Streit, Franz-Josef and Fritz, Florian and Becher, Andreas and Wildermann, Stefan and Werner, Stefan and Schmidt-Korth, Martin and Pschyklenk, Michael and Teich, Jürgen}, booktitle = {IEEE Proceedings of the 13th International Symposium on Hardware Oriented Security and Trust}, date = {2020-12-07/2020-12-11}, doi = {10.1109/HOST45689.2020.9300126}, faupublication = {yes}, keywords = {Secure Boot; Non-Volatile Memory Protection; Programmable System-on-Chip; FPGA; Hardware/Software Co-Design;}, peerreviewed = {Yes}, title = {{Secure} {Boot} from {Non}-{Volatile} {Memory} for {Programmable} {SoC}-{Architectures}}, venue = {San José, USA}, year = {2020} } @inproceedings{faucris.108593584, author = {Pirkl, Jutta and Becher, Andreas and Echavarria Gutiérrez, Jorge Alfonso and Teich, Jürgen and Wildermann, Stefan}, booktitle = {Proceedings of the 20th International Workshop on Software and Compilers for Embedded Systems}, date = {2017-06-12/2017-06-13}, doi = {10.1145/3078659.3078669}, faupublication = {yes}, pages = {89-92}, peerreviewed = {unknown}, title = {{Self}-{Adaptive} {FPGA}-{Based} {Image} {Processing} {Filters} {Using} {Approximate} {Arithmetics}}, venue = {Sankt Goar}, year = {2017} } @inproceedings{faucris.119107604, abstract = {The recent years have shown the emergence of heterogeneous system architecture (HSA), which offers massive computational power assembled into a compact design. Computer vision applications with massive inherent parallelism highly benefits from such heterogeneous processors with on-chip CPU and GPU units. The highly parallel and compute intensive parts of the application program can be mapped to the GPU while the control flow and high level tasks may run on the CPU. However, they pose considerable challenge to software development due to their hybrid architecture. Sharing of resources (GPU or CPU) among applications running concurrently, leads to variations in processing interval and prolonged processing intervals leads to low quality results (frame drops) for computer vision algorithms. In this work, we propose resource-awareness and self organisation within the application layer to adapt to available resources on the heterogeneous processor. The benefits of the new model is demonstrated using a widely used computer vision algorithm called Harris corner detector. A resource-aware runtime-system and a heterogeneous processor were used for evaluation and the results indicate a well constrained processing interval and reduced frame-drops. Our evaluations demonstrate up to 20% improvements in processing rate and accuracy of the detected corner points for Harris corner detectio}, address = {Gières, France}, author = {Paul, Johny and Stechele, Walter and Sousa, Éricles and Lari, Vahid and Hannig, Frank and Teich, Jürgen and Kröhnert, Manfred and Asfour, Tamim}, booktitle = {Proc. of the Conference on Design and Architectures for Signal and Image Processing (DASIP)}, date = {2014-10-08/2014-10-10}, doi = {10.1109/DASIP.2014.7115616}, faupublication = {yes}, isbn = {979-10-92279-06-1}, note = {UnivIS-Import:2015-04-16:Pub.2014.tech.IMMD.inform.selfad}, pages = {1-6}, peerreviewed = {unknown}, publisher = {ECSI Media}, title = {{Self}-{Adaptive} {Harris} {Corner} {Detection} on {Heterogeneous} {Many}-core {Processor}}, venue = {Madrid}, year = {2014} } @inproceedings{faucris.117318124, author = {Wildermann, Stefan and Teich, Jürgen}, booktitle = {Proceedings of the Workshop on Self-Improving System Integration (SISSY 2014)}, date = {2014-09-08/2014-09-08}, faupublication = {yes}, pages = {8}, peerreviewed = {unknown}, title = {{Self}-{Integration} for {Virtualization} of {Embedded} {Many}-{Core} {Systems}}, venue = {London}, year = {2014} } @inproceedings{faucris.118494024, abstract = {A growing number of control systems are distributed and based on the use of a communication bus. The distributed nodes execute periodic tasks, which access the bus by releasing the messages using a priority-based mechanism with the goal of minimal message response times. Instead of randomly accessing the bus, a dynamic scheduling of messages technique based on adaptation of time offsets between message releases is used. The presented algorithm, called DynOAA, is executing on each node of the distributed system. It takes into account the current traffic on the bus and tries to avoid simultaneous release of messages by different nodes, hence reduces the likelihood of conflicts and need for repeated release. In this paper, we first address single bus (segment) systems and then extend the model and the offset adaptation algorithm to systems that use multiple buses (segments) connected by a communication gateway. A rating function based on the average of maximum response times is used to analyze DynOAA for the case of CAN-bus systems based on bit-accurate simulations. Experiments show the robustness of the algorithm (1) in case of fully asynchronous systems, (2) ability to deal with systems that change their configuration (add or remove message release nodes) dynamically and (3) model systems containing multiple bus segments connected by a gateway. The approach is also applicable to other priority-based bus systems. © 2011 Springer-Verlag.}, address = {Heidelberg}, author = {Ziermann, Tobias and Salcic, Zoran and Teich, Jürgen}, booktitle = {Proc. of the 8th International Conference on Autonomic and Trusted Computing}, date = {2011-09-02/2011-09-04}, doi = {10.1007/978-3-642-23496-5{\_}10}, faupublication = {yes}, isbn = {978-3-642-23495-8}, note = {UnivIS-Import:2015-04-16:Pub.2011.tech.IMMD.inform.selfor}, pages = {132-148}, publisher = {Springer-verlag}, series = {Lecture Notes in Computer Science (LNCS)}, title = {{Self}-organized {Message} {Scheduling} for {Asynchronous} {Distributed} {Embedded} {Systems}}, venue = {Banff}, volume = {6906}, year = {2011} } @inproceedings{faucris.118308344, abstract = {In this paper, we present an analysis of self-organizing bandwidth sharing in priority-based medium access. For this purpose, the priority-based Access Game is introduced. Analysis shows that a fair distribution of bandwidth cannot be achieved in this game. Therefore, we enhance this game by introducing a constraint that demands a small amount of the overall bandwidth being free. Fair bandwidth sharing is one Nash Equilibrium of this enhanced game, but not a unique one. Based upon this theoretical analysis, a multi-agent reinforcement learning algorithm is proposed, where each agent tries to maximize its success rate for accessing the medium, while avoiding to violate the bandwidth constraint. We experimentally evaluate this mechanism for a system comprised of selfish agents. Experimental results show that the system is able to self-organize itself towards a fair distribution of bandwidth in a totally decentralized way without the need of global information or coordination. © 2009 IEEE.}, author = {Wildermann, Stefan and Ziermann, Tobias and Teich, Jürgen}, booktitle = {Proc. 3rd IEEE International Conference on Self-Adaptive and Self-Organizing Systems}, date = {2009-09-14/2009-09-18}, doi = {10.1109/SASO.2009.18}, faupublication = {yes}, note = {UnivIS-Import:2015-04-16:Pub.2009.tech.IMMD.inform.selfor}, pages = {144-153}, title = {{Self}-{Organizing} {Bandwidth} {Sharing} in {Priority}-based {Medium} {Access}}, venue = {San Francisco}, year = {2009} } @inproceedings{faucris.121632984, abstract = {Computer vision is one of the key research topics of modern computer science and finds application in manufacturing, surveillance, automotive, robotics, and sophisticated human-machine-interfaces. These applications require small and efficient solutions which are commonly provided as embedded systems. This means that there exist resource constraints, but also the need for increasing adaptivity and robustness. This paper proposes an autonomic computing framework for robust object tracking. A probabilistic tracking algorithm is combined with the use of multi-filter fusion of redundant image filters. The system can react on unpredictable changes in the environment through self-adaptation. Due to resource constraints, the number of filters actively used for tracking is limited. By means of self-organization, the system structure is re-organized to activate filters adequate for the current context. The proposed framework is designed for, but not limited to, embedded computer vision. Experimental evaluations demonstrate the benefit of the approach. © 2010 Springer-Verlag.}, author = {Wildermann, Stefan and Oetken, Andreas and Teich, Jürgen and Salcic, Zoran}, booktitle = {Proceedings of the 7th International Conference on Autonomic and Trusted Computing}, date = {2010-10-26/2010-10-29}, doi = {10.1007/978-3-642-16576-4{\_}1}, faupublication = {yes}, isbn = {9783642165757}, pages = {1-16}, peerreviewed = {unknown}, title = {{Self}-organizing computer vision for robust object tracking in smart cameras}, venue = {Xi'an}, year = {2010} } @inproceedings{faucris.118710064, author = {Ziermann, Tobias and Wildermann, Stefan and Teich, Jürgen}, booktitle = {Proc. Parallel-Algorithmen, -Rechnerstrukturen und -Systemsoftware}, date = {2013-04-11/2013-04-12}, faupublication = {yes}, note = {UnivIS-Import:2015-04-16:Pub.2013.tech.IMMD.inform.selfor}, pages = {1-10}, title = {{Self}-organizing {Core} {Allocation}}, venue = {Erlangen}, year = {2013} } @inproceedings{faucris.118309884, abstract = {Self-organization is a natural concept that helps complex systems to adapt themselves autonomically to their environment. In this paper, we present a self-organizing framework for multi-cue fusion in embedded imaging. This means that several simple image filters are used in combination to lead to a more robust system behavior. Human motion tracking serves as a show case. The system adapts to changes in the environment while tracking a person. Besides this, system customization can be simplified. The designer just has to select a desired set of image filters for a given task. The system then finds the appropriate parameters, e.g., the weighting of different cues. With the option of partial re-configuration, FPGAs support this type of customization. An FPGA-based prototype implementation demonstrates the feasibility of this approach. Tracking and adaptation work in real-time with 25 FPS and a resolution of 640x480. ©2009 IEEE.}, author = {Wildermann, Stefan and Walla, Gregor and Ziermann, Tobias and Teich, Jürgen}, booktitle = {Proc. 19th International Conference on Field-Programmable Logic and Applications}, date = {2009-08-31/2009-09-02}, doi = {10.1109/FPL.2009.5272523}, faupublication = {yes}, note = {UnivIS-Import:2015-04-16:Pub.2009.tech.IMMD.inform.selfor{\_}1}, pages = {132-137}, title = {{Self}-{Organizing} {Multi}-cue {Fusion} for {FPGA}-based {Embedded} {Imaging}}, venue = {Prague}, year = {2009} } @inproceedings{faucris.121663124, abstract = {In this paper, we introduce the concept of (self-)reconfigurable finite state machines as a formal model to describe state-machines implemented in hardware that may be reconfigured during operation. By the advent of reconfigurable logic devices such as FPGAs, this model may become important to characterize and implement (self-)reconfigurable hardware. An FSM is called (self-)reconfigurable if reconfiguration of either output function or transition function is initiated by the FSM itself and not based on external reconfiguration events. We propose an efficient hardware realisation and give algorithmic solutions and bounds for the reconfiguration overhead of migrating a given FSM specification into a new target FSM. © 2002 IEEE.}, author = {Teich, Jürgen and et al.}, author_hint = {Köster M., Teich J.}, booktitle = {Proc. DATE 2002, Design, Automation and Test in Europe}, doi = {10.1109/DATE.2002.998356}, faupublication = {no}, pages = {559-566}, peerreviewed = {unknown}, support_note = {Author relations incomplete. You may find additional data in field 'author{\_}hint'}, title = {({Self}-)reconfigurable finite state machines: {Theory} and implementation}, venue = {Paris}, year = {2002} } @inproceedings{faucris.122023924, author = {Streubühr, Martin and Jäntsch, Michael and Haubelt, Christian and Teich, Jürgen and Schneider, Axel}, booktitle = {11. GI/ITG/GMM-Workshop Methoden und Beschreibungssprachen zur Modellierung und Verifikation von Schaltungen und Systemen}, date = {2008-03-03/2008-03-05}, faupublication = {yes}, pages = {139-148}, peerreviewed = {unknown}, title = {{Semi}-{Automatic} {Generation} of mixed {Hardware}-{Software} {Prototypes} from {Simulink} {Models}}, venue = {Freiburg}, year = {2008} } @inproceedings{faucris.264786442, abstract = {Locating a cryptographic operation in a side channel trace, i.e. finding out where it is in the time domain, without having a template, can be a tedious task even for unprotected implementations. The sheer amount of data can be overwhelming. In a simple call to OpenSSL for AES-128 ECB encryption of a single data block, only 0.00028% of the trace relate to the actual AES-128 encryption. The rest is overhead.
We introduce the (to our best knowledge) first method to locate a cryptographic operation in a side channel trace in a largely automated fashion. The method exploits meta information about the cryptographic operation and requires an estimate of its implementation's execution time.
The method lends itself to parallelization and our implementation in a tool greatly benefits from GPU acceleration. The tool can be used offline for trace segmentation and for generating a template which can then be used online in real-time waveform-matching based triggering systems for trace acquisition or fault injection. We evaluate it in six scenarios involving hardware and software implementations of different cryptographic operations executed on diverse platforms. Two of these scenarios cover realistic protocol level use-cases and demonstrate the real-world applicability of our tool in scenarios where classical leakage-detection techniques would not work. The results highlight the usefulness of the tool because it reliably and efficiently automates the task and therefore frees up time of the analyst.
The method does not work on traces of implementations protected by effective time randomization countermeasures, e.g., random delays and unstable clock frequency, but is not affected by masking, shuffling and similar countermeasures.Particularly when such systems are deployed in areas without access to a static power supply, they have to be powered using energy harvesting to operate autonomously.
Objectives such as availability and data loss rate depend on the set of attached sensors, the system configuration (e.g., used PV module, batteries, and data storage), as well as environmental factors such as the location of the deployed system.
Moreover, also the employed energy management strategy and its parametrization severely influence the system characteristics.
In fact, different strategies can lead to different tradeoffs in terms of the above objectives.
In this paper we propose a design methodology to automatically explore the design space of configurations of multi-sensor embedded systems and to determine and configure the best energy management strategy for a given sensor configuration and location.
Our methodology includes a real-time analysis and a simulation-based DSE to explore the design space.
We investigate a case study from a biomonitoring project and demonstrate the benefits of the proposed design methodology: A system---including its configuration and energy management strategy---has to be tailored to the characteristics of the set of attached sensors and the location it operates.
Else designs exhibit suboptimal characteristics when operating at sites or for sensor sets for which they were not optimized. method aims at generating correlated samples from the uncertainty distribution of components’ reliability such that the shape and statistical properties of each individual distribution remain unchanged. Experimental results confirm that the proposed correlation model enables the employed uncertainty-aware analysis to accurately calculate uncertainty at system leve}, author = {Khosravi, Faramarz and Müller, Malte and Glaß, Michael and Teich, Jürgen}, doi = {10.1177/1748006X18758720}, faupublication = {yes}, journal = {Proceedings of the Institution of Mechanical Engineers, Part O: Journal of Risk and Reliability}, keywords = {Reliability analysis, uncertainty correlation modeling, Monte Carlo simulation, system-level design}, pages = {725-737}, peerreviewed = {Yes}, title = {{Simulation}-based {Uncertainty} {Correlation} {Modeling} in {Reliability} {Analysis}}, url = {https://journals.sagepub.com/doi/abs/10.1177/1748006X18758720}, volume = {232}, year = {2018} } @inproceedings{faucris.118576524, address = {Ghent, Belgium}, author = {Roloff, Sascha and Hannig, Frank and Teich, Jürgen}, booktitle = {Proc. of the 8th International Summer School on Advanced Computer Architecture and Compilation for High-Performance and Embedded Systems (ACACES)}, date = {2012-07-08/2012-07-14}, faupublication = {yes}, isbn = {978-90-382-1987-5}, note = {UnivIS-Import:2015-04-16:Pub.2012.tech.IMMD.inform.simula}, pages = {127-130}, publisher = {Academia Press}, title = {{Simulation} of {Resource}-{Aware} {Applications} on {Heterogeneous} {Architectures}}, venue = {Fiuggi}, year = {2012} } @inproceedings{faucris.122816144, abstract = {Embedded real-time image processing applications working on large images have to process and store huge amounts of data. Consequently the organization of the memory buffers and the precise determination of the required buffer sizes are critical steps for efficient system implementation. In this paper, we propose a new method, that permits the analysis to be performed automatically for local image processing algorithms. The latter ones are specified by help of the Windowed Synchronous Data Flow (WSDF) model, a multi-dimensional model of computation which has been especially designed to represent local image processing algorithms. This paper introduces a corresponding buffer organization leading to solutions comparable to hand-built designs concerning the required memory. Special care is taken, so that also large problems in terms of the image size can be analyzed. The applicability of our approach is demonstrated by help of a JPEG2000 decoder model. © 2007 IEEE.}, author = {Keinert, Joachim and Haubelt, Christian and Teich, Jürgen}, booktitle = {Proceedings of the International Conference on Embedded Computer Systems: Architectures, Modeling and Simulation, Acoustics, Speech, and Signal Processing (IC-SAMOS VII)}, date = {2007-07-16/2007-07-19}, doi = {10.1109/ICSAMOS.2007.4285747}, faupublication = {yes}, isbn = {9781424410583}, pages = {161-168}, peerreviewed = {unknown}, title = {{Simulative} buffer analysis of local image processing algorithms described by windowed synchronous data flow}, venue = {Samos}, year = {2007} } @inproceedings{faucris.316294661, abstract = {Convolutional Neural Networks (CNNs) are widely employed to solve various problems, e.g., image classification. Due to their compute- and data-intensive nature, CNN accelerators have been developed as ASICs or on FPGAs. The increasing complexity of applications has caused resource costs and energy requirements of these accelerators to grow. Spiking Neural Networks (SNNs) are an emerging alternative to CNN implementations, promising higher resource and energy efficiency. The main research question addressed in this paper is whether SNN accelerators truly meet these expectations of reduced energy demands compared to their CNN equivalents when implemented on modern FPGAs. For this purpose, we analyze multiple SNN hardware accelerators for FPGAs regarding performance and energy efficiency. We also present a novel encoding scheme of spike event queues and a novel memory organization technique to improve SNN energy efficiency further. Both techniques have been integrated into a state-of-the-art SNN architecture and evaluated for MNIST, SVHN, and CIFAR-10 data sets and corresponding network architectures on two differently sized modern FPGA platforms. A result of our empirical analysis is that for complex benchmarks such as SVHN and CIFAR-10, SNNs do live up to their expectations.There exists a wealth of supporting libraries and frameworks that aid programmers with the implementation of applications working on such grids, each built on top of existing parallelization technologies. However, many approaches require the programmer to introduce a different programming paradigm into their application or provide different variants of the code. SYCL is a new programming standard providing a remedy to this dilemma by building on standard C ++17 with its so-called single-source approach: Programmers write standard C ++ code and expose parallelism using C++17 keywords. The application is
then transformed into a concrete implementation by the SYCL implementation. By encapsulating the OpenCL ecosystem, different SYCL implementations enable not only the programming of CPUs but also of heterogeneous platforms such as GPUs or other devices. For the first time, this paper showcases a SYCL-based solver for the nodal Discontinuous Galerkin method for Maxwell’s equations on unstructured meshes. We compare our solution to a previous C-based implementation with respect to programmability and performance on heterogeneous platforms.

Big Data applications frequently involve the processing of data streams encoded in semi-structured data formats such as JSON.

A major challenge is that the parsing of such data formats is usually highly complex.

Accelerating JSON parsing on FPGAs has therefore become a focus of recent research.

FPGA accelerators were presented which serve as a co-processor for a CPU to convert JSON into a format that is easier for the CPU to process, e.g., Apache Arrow.

However, in case the parsed data should be further processed on the FPGA, such solutions are insufficient

as the format created is unsuitable for further processing on FPGAs and, above all, because the accelerators have an immense resource requirement.

We, therefore, present a novel FPGA parser architecture that is able to interpret JSON data to selectively extract attributes based on a query expression into a format suitable for stream processing on the FPGA.

We furthermore show how the sparsity of JSON can be used to implement a resource-efficient design, only requiring few FPGA resources.

This leaves the major share of resources free for accelerating subsequent processing steps of a given application.

Our experimental evaluation shows that we can achieve a throughput of 36.5~MB/s per kLUTs which is about 2.5 times higher than the throughput per LUT achievable on the most efficient related approach.

}, author = {Hahn, Tobias and Wildermann, Stefan and Teich, Jürgen}, booktitle = {IEEE Proceedings of the 33rd International Conference on Field Programmable Logic and Applications}, date = {2023-09-04/2023-09-08}, doi = {10.1109/FPL60245.2023.00034}, faupublication = {yes}, keywords = {JSON; FPGA; parsing; architecture}, peerreviewed = {Yes}, title = {{SPEAR}-{JSON}: {Selective} parsing of {JSON} to enable accelerated stream processing on {FPGAs}}, venue = {Göteborg}, year = {2023} } @article{faucris.305829207, author = {Bosio, Alberto and Barbareschi, Mario and Savino, Alessandro and Han, Jie and Teich, Jürgen}, doi = {10.1109/MDAT.2022.3221909}, faupublication = {yes}, journal = {IEEE Design & Test}, pages = {5-7}, peerreviewed = {Yes}, title = {{Special} {Issue} on {Approximate} {Computing}: {Challenges}, {Methodologies}, {Algorithms}, and {Architectures} for {Dependable} and {Secure} {Systems}}, volume = {40}, year = {2023} } @inproceedings{faucris.308906651, abstract = {This paper explores the challenges and opportunities of integrating
non-volatile memories (NVMs) into embedded systems for ma-
chine learning. NVMs offer advantages such as increased memory
density, lower power consumption, non-volatility, and compute-in-
memory capabilities. The paper focuses on integrating NVMs into
embedded systems, particularly in intermittent computing, where
systems operate during periods of available energy. NVM technolo-
gies bring persistence closer to the CPU core, enabling efficient
designs for energy-constrained scenarios. Next, computation in re-
sistive NVMs is explored, highlighting its potential for accelerating
machine learning algorithms. However, challenges related to relia-
bility and device non-idealities need to be addressed. The paper also
discusses memory-centric machine learning, leveraging NVMs to
overcome the memory wall challenge. By optimizing memory lay-
outs and utilizing probabilistic decision tree execution and neural
network sparsity, NVM-based systems can improve cache behavior
and reduce unnecessary computations. In conclusion, the paper
emphasizes the need for further research and optimization for the
widespread adoption of NVMs in embedded systems presenting
relevant challenges, especially for machine learning applications.

As a solution, this article proposes a a unique mixed static/dynamic approach called symbolic loop compilation. It is shown that at compile time, the NP-complete problems (modulo scheduling, register allocation, and routing) can still be solved to optimality in a symbolic way resulting in a so-called symbolic configuration, a space-efficient intermediate representation parameterized in the loop bounds and number of PEs. This phase is called symbolic mapping. At runtime, for each requested accelerated execution of a loop program with given loop bounds and known number of available PEs, a concrete configuration, including PE programs and configuration data for all other components, is generated from the symbolic configuration according to these parameter values. This phase is called instantiation. We describe both phases in detail and show that instantiation runs in polynomial time with its most complex step, program instantiation, not directly depending on the number of PEs and thus scaling to arbitrary sizes of TCPAs.

To validate the efficiency of this mixed static/dynamic compilation approach, we apply symbolic loop compilation to a set of real-world loop programs from several domains, measuring both compilation time and space requirements. Our experiments confirm that a symbolic configuration is a space-efficient representation suited for systems with little memory---in many cases, a symbolic configuration is smaller than even a single concrete configuration instantiated from it---and that the times for the run-time phase of program instantiation and configuration loading are negligible and moreover independent of the size of the available processor array. To give an example, instantiating a configuration for a matrix-matrix multiplication benchmark takes equally long for 4x4 and 32x32 PEs.

}, author = {Witterauf, Michael and Walter, Dominik and Hannig, Frank and Teich, Jürgen}, doi = {10.1145/3466897}, faupublication = {yes}, journal = {ACM Transactions on Embedded Computing Systems}, peerreviewed = {Yes}, title = {{Symbolic} {Loop} {Compilation} for {Tightly} {Coupled} {Processor} {Arrays}}, year = {2021} } @inproceedings{faucris.109562244, abstract = {Loop parallelization techniques for massively parallel processor arrays using one-level tiling are often either I/O- or memory-bounded, exceeding the target architecture's capabilities. Furthermore, if the number of available processing elements is only known at runtime - as in adaptive systems - static approaches fail. To solve these problems, we present a hybrid compile/runtime technique to symbolically parallelize loop nests with uniform dependences on multiple levels. At compile time, two novel transformations are performed: (a) symbolic hierarchical tiling followed by (b) symbolic multi-level scheduling. By tuning the size of the tiles on multiple levels, a trade-off between the necessary I/O-bandwidth and memory is possible, which facilitates obeying resource constraints. The resulting schedules are symbolic with respect to the number of tiles; thus, the number of processing elements to map onto does not need to be known at compile time. At runtime, when the number is known, a simple prolog chooses a feasible schedule with respect to I/O and memory constraints that is latency-optimal for the chosen tile size. In this way, our approach dynamically chooses latency-optimal and feasible schedules while avoiding expensive re-compilations.}, author = {Tanase, Alexandru-Petru and Witterauf, Michael and Hannig, Frank and Teich, Jürgen}, booktitle = {Proceedings of the 13th ACM-IEEE International Conference on Formal Methods and Models for System Design (MEMOCODE)}, date = {2015-09-21/2015-09-23}, doi = {10.1109/MEMCOD.2015.7340486}, faupublication = {yes}, isbn = {9781509002375}, keywords = {Mathematical model; Memory management; Processor scheduling; Runtime; Schedules; Silicon}, pages = {188-197}, peerreviewed = {unknown}, publisher = {Institute of Electrical and Electronics Engineers Inc.}, title = {{Symbolic} loop parallelization for balancing {I}/{O} and memory accesses on processor arrays}, venue = {Austin}, year = {2015} } @inproceedings{faucris.118577404, address = {Ghent, Belgium}, author = {Tanase, Alexandru-Petru and Hannig, Frank and Teich, Jürgen}, booktitle = {Proc. of the 8th International Summer School on Advanced Computer Architecture and Compilation for High-Performance and Embedded Systems (ACACES)}, date = {2012-07-08/2012-07-14}, faupublication = {yes}, isbn = {978-90-382-1987-5}, note = {UnivIS-Import:2015-04-16:Pub.2012.tech.IMMD.inform.symbol}, pages = {33-36}, publisher = {Academia Press}, title = {{Symbolic} loop parallelization of static control programs}, venue = {Fiuggi}, year = {2012} } @incollection{faucris.120677524, abstract = {In this paper, we present a solution to the problem of joint tiling and scheduling a given loop nest with uniform data dependencies symbolically. This challenge arises when the size and number of available processors for parallel loop execution is not known at compile time. But still, in order to avoid any overhead of dynamic (run-time) recompilation, a schedule of loop iterations shall be computed and optimized statically. In this paper, it will be shown that it is possible to derive parameterized latency-optimal schedules statically by proposing a two step approach: First, the iteration space of a loop program is tiled symbolically into orthotopes of parametrized extensions. Subsequently, the resulting tiled program is also scheduled symbolically, resulting in a set of latency-optimal parameterized schedule candidates. At run time, once the size of the processor array becomes known, simple comparisons of latency-determining expressions finally steer which of these schedules will be dynamically selected and the corresponding program configuration executed on the resulting processor array so to avoid any further run-time optimization or expensive recompilation. Our theory of symbolic loop parallelization is applied to a number of loop programs from the domains of signal processing and linear algebra. Finally, as a proof of concept, we demonstrate our proposed methodology for a massively parallel processor array architecture called tightly coupled processor array (TCPA) on which applications may dynamically claim regions of processors in the context of invasive computing.}, address = {Berlin; Heidelberg}, author = {Teich, Jürgen and Tanase, Alexandru-Petru and Hannig, Frank}, booktitle = {Journal of Signal Processing Systems}, doi = {10.1007/s11265-014-0905-0}, faupublication = {yes}, keywords = {Symbolic Loop Parallelisation}, note = {UnivIS-Import:2015-04-20:Pub.2014.tech.IMMD.inform.symbol{\_}0}, pages = {31-59}, peerreviewed = {Yes}, publisher = {Springer-Verlag}, title = {{Symbolic} {Mapping} of {Loop} {Programs} onto {Processor} {Arrays}}, volume = {77(1-2)}, year = {2014} } @inproceedings{faucris.117312184, author = {Graf, Sebastian and Glaß, Michael and Teich, Jürgen}, booktitle = {Proceedings of 18. Workshop Methoden und Beschreibungssprachen zur Modellierung und Verifikation von Schaltungen und Systemen (MBMV 2015)}, date = {2015-03-03/2015-03-04}, faupublication = {yes}, pages = {115-124}, peerreviewed = {unknown}, title = {{Symbolic} {Message} {Routing} for {Multi}-{Objective} {Optimization} of {Automotive} {E}/{E} {Architecture} {Component} {Platforms}}, venue = {Chemnitz}, year = {2015} } @article{faucris.119322104, author = {Tanase, Alexandru-Petru and Witterauf, Michael and Teich, Jürgen and Hannig, Frank}, doi = {10.1145/3092952}, faupublication = {yes}, journal = {ACM Transactions on Embedded Computing Systems}, pages = {31:1-31:27}, peerreviewed = {Yes}, title = {{Symbolic} {Multi}-{Level} {Loop} {Mapping} of {Loop} {Programs} for {Massively} {Parallel} {Processor} {Arrays}}, volume = {17}, year = {2017} } @inproceedings{faucris.122511224, abstract = {In this paper, we present a first solution to the unsolved problem of joint tiling and scheduling a given loop nest with uniform data dependencies symbolically. This problem arises for loop programs for which the iterations shall be optimally scheduled on a processor array of unknown size at compile-time. Still, we show that it is possible to derive parameterized latency-optimal schedules statically by proposing two new program transformations: In the first step, the iteration space is tiled symbolically into orthotopes of parametrized extensions. The resulting tiled program is subsequently scheduled symbolically. Here, we show that the maximal number of potential optimal schedules is upper bounded by 2ⁿ n! where n is the dimension of the loop nest. However, the real number of optimal schedule candidates being much less than this. At run-time, once the size of the processor array becomes known, simple comparisons of latency-determining expressions finally steer which of these schedules will be dynamically activated and the corresponding program configuration executed on the resulting processor array so to avoid any further run-time optimization or expensive recompilations. © 2013 IEEE.}, address = {New York, NY, USA}, author = {Teich, Jürgen and Tanase, Alexandru-Petru and Hannig, Frank}, booktitle = {Proc. 24th International Conference on Application-Specific Systems, Architectures and Processors}, date = {2013-06-05/2013-06-07}, doi = {10.1109/ASAP.2013.6567543}, faupublication = {yes}, isbn = {978-1-4799-0493-8}, note = {UnivIS-Import:2015-04-16:Pub.2013.tech.IMMD.inform.symbol}, pages = {1-9}, publisher = {Institute of Electrical and Electronics Engineers}, title = {{Symbolic} {Parallelization} of {Loop} {Programs} for {Massively} {Parallel} {Processor} {Arrays}}, venue = {Washington, DC}, year = {2013} } @book{faucris.113807804, author = {Tanase, Alexandru-Petru and Hannig, Frank and Teich, Jürgen}, doi = {10.1007/978-3-319-73909-0}, faupublication = {yes}, isbn = {978-3-319-73908-3}, peerreviewed = {unknown}, publisher = {Springer}, title = {{Symbolic} {Parallelization} of {Nested} {Loop} {Programs}}, year = {2018} } @inproceedings{faucris.117123644, abstract = {In this paper, we propose a quasi-static scheduling (QSS) method applicable to actor-oriented SystemC designs. QSS determines a schedule where several static schedules are combined in a dynamic schedule to reduce runtime overhead. This is done by performing as much static scheduling as possible at compile time, and only treating data-dependent control flow as runtime decision. Our approach improves known quasi-static approaches in a way that it is directly applicable to real world designs, and has less restrictions on the underlying model. The effectiveness of the approach based on symbolic computation is demonstrated by scheduling a SystemC design of a network packet filter. © 2008 IEEE.}, author = {Gladigau, Jens and Haubelt, Christian and Teich, Jürgen}, booktitle = {Proceedings of Forum on specification & Design Languages 2008 (FDL08)}, doi = {10.1109/FDL.2008.4641412}, faupublication = {yes}, isbn = {9781424422654}, pages = {1-6}, peerreviewed = {unknown}, title = {{Symbolic} quasi-static scheduling of actor-oriented {SystemC} models}, venue = {Stuttgart}, year = {2008} } @inproceedings{faucris.109564444, abstract = {Increasing reliability at a minimum amount of extra cost is a major challenge in todays ECU network design. Considering reliability as an objective already in early design phases has the potential to avoid expensive modifications in later design phases. Hence, there is a need for an appropriate optimization process and efficient analysis techniques to evaluate the found implementations. In this paper, we will show how symbolic techniques can be used to efficiently analyze and optimize such reliable systems. The contribution of this paper is (1) a symbolic reliability analysis that makes use of a partitioned structure function and (2) a symbolic optimization process based on binary ILP solvers. Our case study from the automotive area will show a significant speed-up using our analysis technique. Moreover, our optimization approach is able to offer implementations with considerably improved reliability at no additional costs as well as implementations with reduced costs without decreasing their reliability. © 2008 EDA}, author = {Glaß, Michael and Lukasiewycz, Martin and Reimann, Felix and Haubelt, Christian and Teich, Jürgen}, booktitle = {Proceedings of Design, Automation and Test in Europe (DATE 2008)}, date = {2008-04-10/2008-04-14}, doi = {10.1109/DATE.2008.4484679}, faupublication = {yes}, isbn = {9789783981089}, pages = {158-163}, peerreviewed = {unknown}, title = {{Symbolic} reliability analysis and optimization of {ECU} networks}, venue = {Munich}, year = {2008} } @inproceedings{faucris.117131784, abstract = {In recent years, several network online algorithms have been studied that exhibit self-x properties such as self-healing or self-adaption. These properties are used to improve systems characteristics like, e.g., fault-tolerance, reliability, or load-balancing. In this paper, a symbolic reliability analysis of self-healing networked embedded systems that rely on self-reconfiguration and self-routing is presented. The proposed analysis technique respects resource constraints such as the maximum computational load or the maximum memory size, and calculates the achievable reliability of a given system. This analytical approach considers the topology of the system, the properties of the resources, and the executed applications. Moreover, it is independent of the used online algorithms that implement the self-healing properties, but determines the achievable upper bound for the systems reliability. Since this analysis is not tailored to a specific online algorithm, it allows a reasonable decision making on the used algorithm by enabling a rating of different self-healing strategies. Experimental results show the effectiveness of the introduced technique even for large networked embedded systems. © 2008 Springer-Verlag Berlin Heidelberg.}, author = {Glaß, Michael and Lukasiewycz, Martin and Reimann, Felix and Haubelt, Christian and Teich, Jürgen}, booktitle = {Proceedings of the 27th International Conference on Computer Safety, Reliability and Security (SAFECOMP 2008)}, date = {2008-09-22/2008-09-25}, doi = {10.1007/978-3-540-87698-4{\_}14}, faupublication = {yes}, isbn = {9783540876977}, pages = {139-152}, peerreviewed = {unknown}, title = {{Symbolic} reliability analysis of self-healing networked embedded systems}, venue = {Newcastle upon Tyne}, year = {2008} } @incollection{faucris.109565984, abstract = {In this chapter, we propose a quasi-static scheduling (QSS) method applicable to SystemC dataflow designs. QSS determines a schedule where several static schedules are combined in a dynamic schedule. This, among others, reduces runtime overhead. QSS is done by performing as much static scheduling as possible at compile time, and only treating data-dependent control flow as runtime decision. Our approach improves known quasi-static approaches in a way that it is automatically applicable to real world designs, and has less restrictions on the underlying model. The effectiveness of the approach based on symbolic computation is demonstrated by scheduling a SystemC design of a network packet filter. © 2009 Springer Science+Business Media B.V.}, author = {Gladigau, Jens and Haubelt, Christian and Teich, Jürgen}, booktitle = {Languages for Embedded Systems and their Applications}, doi = {10.1007/978-1-4020-9714-0{\_}12}, editor = {M. Radetzki}, faupublication = {yes}, isbn = {9781402097133}, keywords = {Quasi-static; Software scheduling; Symbolic methods; SystemC}, pages = {183-199}, peerreviewed = {unknown}, publisher = {Springer}, series = {Lecture Notes in Electrical Engineering}, title = {{Symbolic} scheduling of {SystemC} dataflow designs}, volume = {36}, year = {2009} } @article{faucris.119684664, abstract = {Modern embedded systems provide a variety of functionality as operational modes, each corresponding to a mutually exclusive phase of operation. This paper provides a system level design methodology tailored for such multi-mode systems. By incorporating knowledge about the temporal behavior, it is possible to share hardware by means of partial reconfiguration on sophisticated Field Programmable Gate Arrays (FPGAs), and thus, reduce costs and improve performance. The presented methodology is based on an exploration model, which specifies the temporal behavior of the system functionality as well as the architectural characteristics of nowadays reconfigurable technology. We develop a symbolic encoding of this system specification, which enables unified system synthesis by applying sophisticated optimization techniques to perform allocation, binding, placement of partially reconfigurable modules, and routing the on-chip communication. The presented system-level design methodology complies with the state-of-the-art synthesis tools and communication technologies for partially reconfigurable systems. We demonstrate this by experiments on test cases from the image processing domain applying state-of-the-art technology. The results give evidence of the efficiency of the methodology and show the superiority in terms of runtime and quality of the found solutions compared to existing system-level synthesis approaches. © 2012 Springer Science+Business Media New York.}, author = {Wildermann, Stefan and Reimann, Felix and Ziener, Daniel and Teich, Jürgen}, doi = {10.1007/s10617-012-9102-1}, faupublication = {yes}, journal = {Design Automation For Embedded Systems}, keywords = {Design space exploration; Field-programmable gate arrays; Partial reconfiguration; System level design}, note = {UnivIS-Import:2015-03-09:Pub.2012.tech.IMMD.inform.symbol{\_}0}, pages = {1-33}, peerreviewed = {Yes}, title = {{Symbolic} {System}-level {Design} {Methodology} for {Multi}-{Mode} {Reconfigurable} {Systems}}, year = {2012} } @inproceedings{faucris.117146084, abstract = {More and more embedded systems provide a multitude of services, implemented by a large number of networked hardware components. In early design phases, dimensioning such complex systems in terms of monetary costs, power consumption, reliability etc. demands for new analysis approaches at the electronic system level. In this paper, two symbolic system level reliability analysis approaches are Introduced. First, a formal approach based on Binary Decision Diagrams is presented that allows to calculate exact reliability measures for small to moderatesized systems. Second, a simulative approach is presented that hybridizes a Monte Carlo simulation with a SAT solver and delivers adequate approximations of the reliability measures for large and complex systems. ©2010 IEEE.}, author = {Glaß, Michael and Lukasiewycz, Martin and Reimann, Felix and Haubelt, Christian and Teich, Jürgen}, booktitle = {Proceedings of the International Conference on Computer-Aided Design (ICCAD)}, date = {2010-11-07/2010-11-11}, doi = {10.1109/ICCAD.2010.5654134}, faupublication = {yes}, isbn = {9781424481927}, pages = {185-189}, peerreviewed = {unknown}, title = {{Symbolic} system level reliability analysis}, venue = {San Jose, CA}, year = {2010} } @inproceedings{faucris.121643544, abstract = {Stringent real-time constraints lead to complex search spaces containing only very few or even no valid implementations. Hence, while searching for a valid implementation a substantial amount of time is spent on timing analysis during system synthesis. This paper presents a novel system synthesis approach that efficiently prunes the search space in case real-time constraints are violated. For this purpose, the reason for a constraint violation is analyzed and a deduced encoding removes it permanently from the search space. Thus, the approach is capable of proving both the presence and absence of a correct implementation. The key benefit of the proposed approach stems from its integral support for real-time constraint checking. Its efficiency, however, results from the power of deduction techniques of state-of-the-art Boolean Satisfiability (SAT) solvers. Using a case study from the automotive domain, experiments show that the proposed system synthesis approach is able to find valid implementations where former approaches fail. Moreover, it is up to two orders of magnitude faster compared to a state-of-the-art approach. © 2011 ACM.}, author = {Reimann, Felix and Lukasiewycz, Martin and Glaß, Michael and Haubelt, Christian and Teich, Jürgen}, booktitle = {2011 48th ACM/EDAC/IEEE Design Automation Conference, DAC 2011}, date = {2011-06-05/2011-06-10}, faupublication = {yes}, isbn = {9781450306362}, keywords = {Algorithms; Design}, pages = {393-398}, peerreviewed = {unknown}, title = {{Symbolic} system synthesis in the presence of stringent real-time constraints}, url = {https://www.scopus.com/inward/record.url?partnerID=HzOxMe3b&scp=80052678758&origin=inward}, venue = {San Diego, CA}, year = {2011} } @inproceedings{faucris.121350724, abstract = {This paper presents a system synthesis approach for dependable embedded systems. The proposed approach significantly extends previous work by automatically inserting fault detection and fault toleration mechanisms into an implementation. The main contributions of this paper are 1) a dependability-aware system synthesis approach that automatically performs a redundant task binding and placement of voting structures to increase both, reliability and safety, respectively, 2) an efficient dependability analysis approach to evaluate lifetime reliability and safety, and 3) results from synthesizing a Motion-JPEG decoder for an FPGA platform using the proposed system synthesis approach. As a result, a set of high-quality solutions of the decoder with maximized reliability, safety, performance, and simultaneously minimized resource requirements is achieved. Copyright 2008 ACM.}, author = {Reimann, Felix and Glaß, Michael and Lukasiewycz, Martin and Keinert, Joachim and Haubelt, Christian and Teich, Jürgen}, booktitle = {Proceedings of the 6th International Conference on Hardware/Software Codesign and System Synthesis (CODES+ISSS)}, date = {2008-10-19/2008-10-24}, doi = {10.1145/1450135.1450190}, faupublication = {yes}, isbn = {9781605584706}, pages = {237-242}, peerreviewed = {unknown}, title = {{Symbolic} voter placement for dependability-aware system synthesis}, venue = {Atlanta, GA}, year = {2008} } @inproceedings{faucris.123136024, author = {Gladigau, Jens and Blendinger, Frank and Haubelt, Christian and Teich, Jürgen}, booktitle = {11. GI/ITG/GMM-Workshop Methoden und Beschreibungssprachen zur Modellierung und Verifikation von Schaltungen und Systemen}, date = {2008-03-03/2008-03-05}, faupublication = {no}, pages = {109-118}, peerreviewed = {unknown}, title = {{Symbolische} {Modellprüfung} {Aktor}-orientierter {High}-level {SystemC}-{Modelle} mit {Intervalldiagrammen}}, venue = {Freiburg}, year = {2008} } @article{faucris.122238204, author = {Schwarzer, Tobias and Weichslgartner, Andreas and Glaß, Michael and Wildermann, Stefan and Brand, Peter and Teich, Jürgen}, doi = {10.1109/TCAD.2017.2695894}, faupublication = {yes}, journal = {IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems}, pages = {297-310}, peerreviewed = {Yes}, title = {{Symmetry}-eliminating {Design} {Space} {Exploration} for {Hybrid} {Application} {Mapping} on {Many}-{Core} {Architectures}}, volume = {37}, year = {2018} } @misc{faucris.107885184, author = {Falk, Joachim and Haubelt, Christian and Teich, Jürgen}, faupublication = {yes}, peerreviewed = {automatic}, title = {{Syntax} and execution behavior of {SysteMoC}}, year = {2005} } @inproceedings{faucris.121328944, author = {Glaß, Michael and Lukasiewycz, Martin and Haubelt, Christian and Streichert, Thilo and Teich, Jürgen}, booktitle = {Proceedings of Zuverlässigkeit und Entwurf (ZuD 2007)}, date = {2007-03-26/2007-03-28}, faupublication = {yes}, pages = {141-148}, peerreviewed = {unknown}, title = {{Synthese} zuverlässiger und flexibler {Systeme}}, venue = {Munich}, year = {2007} } @misc{faucris.117057864, author = {Teich, Jürgen}, faupublication = {no}, peerreviewed = {automatic}, title = {{Synthesis} and {Optimization} of {Digital} {Hardware}/{Software} {Systems}}, year = {1996} } @incollection{faucris.122901064, author = {Teich, Jürgen}, booktitle = {In System Design Automation}, editor = {R. Merker and W. Schwarz}, faupublication = {no}, note = {UnivIS-Import:2015-04-20:Pub.2001.tech.IMMD.inform.synthe{\_}7}, pages = {3-26}, peerreviewed = {unknown}, publisher = {Kluwer Academic Publishers}, title = {{Synthesis} and {Optimization} of {Digital} {Hardware}/{Software} {Systems}}, year = {2001} } @article{faucris.114462084, abstract = {In the domain of image processing, often real-time constraints are required. In particular, in safety-critical applications, timing is of utmost importance. A common approach to maintain real-time capabilities is to offload computations to dedicated hardware accelerators, such as Field Programmable Gate Arrays (FPGAs). Designing such architectures is per se already a challenging task, but finding the right design point between achieving as much throughput as necessary while spending as few resources as possible is an even bigger challenge. To address this design challenge in the domain of image processing, several approaches have been presented that introduce an additional layer of abstraction between the developer and the actual target hardware. One approach is to use a Domain-Specific Language (DSL) to generate highly optimized code for synthesis by general purpose High-Level Synthesis (HLS) frameworks. Another approach is to instantiate a generic VHDL IP-Core library for local imaging operators. Elevating the description of image algorithms to such a higher abstraction level can significantly reduce the complexity for designing hardware accelerators targeting FPGAs. We provide a comparison of results for both approaches, a non-expert algorithm developer can achieve. Furthermore, we present an automatic optimization process to give the algorithm developer even more control over trading execution time for resource usage, that could be applied on top of both approaches. To evaluate our optimization procedure, we compare the resulting FPGA accelerators to highly optimized Graphics Processing Unit (GPU) implementations of several image filters relevant for close-to-sensor image and video processing with stringent real-time constraints, such as in the automotive domain.}, author = {Reiche, Oliver and Häublein, Konrad and Reichenbach, Marc and Schmid, Moritz and Hannig, Frank and Teich, Jürgen and Fey, Dietmar}, doi = {10.1016/j.sysarc.2015.09.004}, faupublication = {yes}, journal = {Journal of Systems Architecture}, keywords = {Hardware Accelerators; Image Processing; Synthesis; Code Generation}, note = {UnivIS-Import:2016-02-10:Pub.2015.tech.IMMD.IMMD3.{\_}synth}, pages = {646-658}, peerreviewed = {Yes}, title = {{Synthesis} and {Optimization} of {Image} {Processing} {Accelerators} using {Domain} {Knowledge}}, url = {https://www12.cs.fau.de/downloads/reiche/publications/RHRSHTF15.pdf}, volume = {61}, year = {2015} } @inproceedings{faucris.108355104, author = {Teich, Jürgen and Bednara, Marcus}, booktitle = {Proc. of the First International Conference on Engineering of Reconfigurable Systems and Algorithms}, date = {2001-06-25/2001-06-28}, faupublication = {no}, note = {UnivIS-Import:2015-04-16:Pub.2001.tech.IMMD.inform.synthe}, pages = {1-7}, title = {{Synthesis} of {FPGA} {Implementations} from {Loop} {Algorithms}}, venue = {Las Vegas, Nevada}, year = {2001} } @inproceedings{faucris.109575224, abstract = {Due to increasing complexity of modern real-time image processing applications, classical hardware development at register transfer level becomes more and more the bottleneck of technological progress. Modeling those applications by help of multi-dimensional data flow and providing efficient means for their synthesis in hardware is one possibility to alleviate the situation. The key element of such descriptions is a multi-dimensional FIFO whose hardware synthesis shall be investigated in this paper. In particular, it considers the occurring out-of-order communication and proposes an architecture which is able to handle both address generation and flow control in an efficient manner. The resulting implementation allows reading and writing one pixel per clock cycle with an operation frequency of up to 300 MHz. This is even sufficient to process very huge images occurring in the domain of digital cinema in real-time. © 2008 Springer-Verlag Berlin Heidelberg.}, author = {Keinert, Joachim and Haubelt, Christian and Teich, Jürgen}, booktitle = {Proceedings of the International Conference on Architecture of Computing Systems (ARCS 2008)}, date = {2008-02-25/2008-02-28}, doi = {10.1007/978-3-540-78153-0{\_}11}, faupublication = {yes}, isbn = {9783540781523}, pages = {130-143}, peerreviewed = {unknown}, title = {{Synthesis} of multi-dimensional high-speed {FIFOs} for out-of-order communication}, venue = {Dresden}, year = {2008} } @inproceedings{faucris.216171721, abstract = {Programming heterogeneous platforms to achieve high performance is laborious since writing efficient code requires tuning at a low level with architecture-specific optimizations and is based on drastically differing programming models. Performance portability across different platforms can be achieved by decoupling the algorithm description from the target implementation. We present Hipacc (http://hipacc-lang.org), a framework consisting of an open-source image processing DSL and a compiler to target CPUs, GPUs, and FPGAs from the same program. We demonstrate Hipacc’s productivity by considering real-world computer vision applications, e.g., optical flow, and generating target code (C++, OpenCL, C-based HLS) for three platforms (CPU and GPU in a laptop and an FPGA board). Finally, we showcase the real-time processing of images acquired by a USB camera on these platform}, author = {Özkan, Mehmet Akif and Reiche, Oliver and Qiao, Bo and Membarth, Richard and Teich, Jürgen and Hannig, Frank}, booktitle = {Demo at the University Booth at Design, Automation and Test in Europe (DATE)}, date = {2019-03-25/2019-03-29}, faupublication = {yes}, peerreviewed = {unknown}, title = {{Synthesizing} {High}-{Performance} {Image} {Processing} {Applications} with {Hipacc}}, url = {https://www12.cs.fau.de/downloads/oezkan/publications/date-ubooth19.pdf}, venue = {Florence}, year = {2019} } @inproceedings{faucris.116251344, abstract = {In order to accurately predict the behavior of micro-electronic systems with nodal based software tools like SPICE, it is necessary to know appropriate equivalent circuits of the systems of interest. In microwave and RF engineering, this equivalent networks often have to be derived form measurement or EM field calculation via scattering parameters. In this contribution, a methodology is suggested that combines evolutionary algorithms (EAs) with a nodal based assembly technique in order to synthesize passive equivalent networks. Using only the knowledge of the scattering parameters, both structure and component values of an equivalent circuit of a system are determined by EA. © 2003 IEEE.}, author = {Kralicek, Peter and John, Werner and Reinhold, Christian and Teich, Jürgen}, booktitle = {Proceedings of the Congress on Evolutionary Computation (CEC'03)}, date = {2003-12-08/2003-12-12}, doi = {10.1109/CEC.2003.1299883}, faupublication = {yes}, note = {UnivIS-Import:2015-04-16:Pub.2003.tech.IMMD.inform.synthe{\_}6}, pages = {1740-1747}, publisher = {IEEE Computer Society}, series = {Synthesizing Passive Networks by applying Genetic Programming and Evolution Strategies}, title = {{Synthesizing} {Passive} {Networks} by applying {Genetic} {Programming} and {Evolution} {Strategies}}, venue = {Canberra}, year = {2003} } @article{faucris.122199264, author = {Teich, Jürgen and Thiele, Lothar}, doi = {10.1515/FREQ.1990.44.3-4.122}, faupublication = {no}, journal = {Frequenz}, pages = {122-132}, peerreviewed = {Yes}, title = {{Systematic} design concepts for signal processing arrays (invited paper)}, volume = {44}, year = {1990} } @article{faucris.116299964, abstract = {Application-specific, parameterized local search algorithms (PLSAs), in which optimization accuracy can be traded off with run time, arise naturally in many optimization contexts. We introduce a novel approach, called simulated heating, for systematically integrating parameterized local search into evolutionary algorithms (EAs). Using the framework of simulated heating, we investigate both static and dynamic strategies for systematically managing the tradeoff between PLSA accuracy and optimization effort. Our goal is to achieve maximum solution quality within a fixed optimization time budget. We show that the simulated heating technique better utilizes the given optimization time resources than standard hybrid methods that employ fixed parameters, and that the technique is less sensitive to these parameter settings. We apply this framework to three different optimization problems, compare our results to the standard hybrid methods, and show quantitatively that careful management of this tradeoff is necessary to achieve the full potential of an EA/PLSA combination.}, author = {Bambha, Neil and Bhattacharyya, Shuvra S. and Zitzler, Eckart and Teich, Jürgen}, doi = {10.1109/TEVC.2004.823471}, faupublication = {yes}, journal = {IEEE Transactions on Evolutionary Computation}, keywords = {Evolutionary algorithm (EA); Hybrid global/local search}, note = {UnivIS-Import:2015-03-09:Pub.2004.tech.IMMD.inform.system{\_}1}, pages = {137-155}, peerreviewed = {Yes}, title = {{Systematic} {Integration} of {Parameterized} {Local} {Search} {Into} {Evolutionary} {Algorithms}}, volume = {8}, year = {2004} } @inproceedings{faucris.122829564, abstract = {Application-specific, parameterized local search algorithms (PLSAs), in which optimization accuracy can be traded off with run-time, arise naturally in many optimization contexts. We introduce a novel approach, called simulated heating, for systematically integrating parameterized local search into evolutionary algorithms (EAs). Using the framework of simulated heating, we investigate both static and dynamic strategies for systematically managing the trade-off between PLSA accuracy and optimization effort. Our goal is to achieve maximum solution quality within a fixed optimization time budget. We show that the simulated heating technique better utilizes the given optimization time resources than standard hybrid methods that employ fixed parameters, and that the technique is less sensitive to these parameter settings. We demonstrate our techniques on the well-known binary knapsack problem and two problems in electronic design automation. We compare our results to the standard hybrid methods, and show quantitatively that careful management of this trade-off is necessary to achieve the full potential of an EA/PLSA combination. © Springer-Verlag Berlin Heidelberg 2004.}, address = {Berlin, Heidelberg}, author = {Bambha, Neil and Bhattacharyya, Shuvra S. and Teich, Jürgen and Zitzler, Eckart}, booktitle = {Proceedings of the Genetic and Evolutionary Computation Conference}, date = {2004-06-26/2004-06-30}, faupublication = {yes}, isbn = {3-540-22344-4}, note = {UnivIS-Import:2015-04-16:Pub.2004.tech.IMMD.inform.system}, pages = {383-384}, publisher = {Springer-verlag}, title = {{Systematic} {Integration} of {Parameterized} {Local} {Search} {Techniques} in {Evolutionary} {Algorithms}}, venue = {Seattle, Washington}, volume = {3102}, year = {2004} } @article{faucris.121048224, abstract = {With increasing design complexity, the gap from ESL (Electronic System Level) design to RTL synthesis becomes more and more crucial to many industrial projects. Although several behavioral synthesis tools exist to automatically generate synthesizable RTL code from C/C based input descriptions and software generation for embedded processors is automated as well, an efficient ESL synthesis methodology combining both is still missing. This article presents SystemCoDesigner, a novel SystemC-based ESL tool to automatically optimize a hardware/software SoC (System on Chip) implementation with respect to several objectives. Starting from a SystemC behavioral model, SystemCoDesigner automatically extracts the mathematical model, performs a behavioral synthesis step, and explores the multiobjective design space using state-of-the-art multiobjective optimization algorithms. During design space exploration, a single design point is evaluated by simulating highly accurate performance models, which are automatically generated from the SystemC behavioral model and the behavioral synthesis results. Moreover, SystemCoDesigner permits the automatic generation of bit streams for FPGA targets from any previously optimized SoC implementation. Thus SystemCoDesigner is the first fully automated ESL synthesis tool providing a correct-by- construction generation of hardware/software SoC implementations. As a case study, a model of a Motion-JPEG decoder was automatically optimized and implemented using SystemCoDesigner. Several synthesized SoC variants based on this model show different tradeoffs between required hardware costs and achieved system throughput, ranging from software-only solutions to pure hardware implementations that reach real-time performance for QCIF streams on a 50MHz FPGA. © 2009 ACM.}, author = {Keinert, Joachim and Streubühr, Martin and Schlichter, Thomas and Falk, Joachim and Gladigau, Jens and Teich, Jürgen and Haubelt, Christian and Meredith, Michael}, doi = {10.1145/1455229.1455230}, faupublication = {yes}, journal = {ACM Transactions on Design Automation of Electronic Systems}, keywords = {Hardware/software codesign; System design}, note = {UnivIS-Import:2015-04-14:Pub.2009.tech.IMMD.inform.system}, pages = {1-23}, peerreviewed = {Yes}, title = {{SYSTEMCODESIGNER} - {An} {Automatic} {ESL} {Synthesis} {Approach} by {Design} {Space} {Exploration} and {Behavioral} {Synthesis} for {Streaming} {Applications}}, volume = {14}, year = {2009} } @inproceedings{faucris.109579184, abstract = {With the term flexibility, we introduce a new design dimension of an embedded system that quantitatively characterizes its feasibility in implementing not only one, but possibly several alternative behaviors. This is important when designing systems that may adapt their behavior during operation, e.g., due to new environmental conditions, or when dimensioning a platform-based system that must implement a set of different behaviors. A hierarchical graph model is introduced that allows us to model flexibility and cost of a system formally. Based on this model, an efficient exploration algorithm to find the optimal flexibility/cost-tradeoff-curve of a system using the example of the design of a family of set-top boxes is proposed. © 2002 IEEE.}, author = {Haubelt, Christian and Teich, Jürgen and Richter, Kai and Ernst, Rolf}, booktitle = {Proc. DATE 2002, Design, Automation and Test in Europe}, doi = {10.1109/DATE.2002.998399}, faupublication = {no}, pages = {854-861}, peerreviewed = {unknown}, title = {{System} design for flexibility}, venue = {Paris}, year = {2002} } @inproceedings{faucris.118710284, abstract = {As data locality is a key factor for the acceleration of loop programs on processor arrays, we propose a buffer architecture that can be configured at run-time to select between different schemes for memory access. In addition to traditional address-based memory banks, the buffer architecture can deliver data in a streaming manner to the processing elements of the array, which supports dense and sparse stencil operations. Moreover, to minimize data transfers to the buffers, the design contains an interlinked mode, which is especially targeted at 2-D kernel computations. The buffers can be used individually to achieve high data throughput by utilizing a maximum number of I/O channels to the array, or concatenated to provide higher storage capacity at a reduced amount of I/O channels. Copyright 2013 ACM.}, address = {New York, NY, USA}, author = {Hannig, Frank and Schmid, Moritz and Lari, Vahid and Boppu, Srinivas and Teich, Jürgen}, booktitle = {Proc. ACM International Conference on Computing Frontiers}, date = {2013-05-14/2013-05-16}, doi = {10.1145/2482767.2482770}, faupublication = {yes}, isbn = {978-1-4503-2053-5}, keywords = {Processor array; Reconfigurable buffer; System integration}, note = {UnivIS-Import:2015-04-16:Pub.2013.tech.IMMD.inform.system}, pages = {1-4}, publisher = {ACM Press}, title = {{System} {Integration} of {Tightly}-{Coupled} {Processor} {Arrays} using {Reconfigurable} {Buffer} {Structures}}, venue = {Ischia}, year = {2013} } @inproceedings{faucris.118311644, abstract = {This paper studies the loosely integration of application accelerators consisting of an array of tightly-coupled lightweight reconfigurable processors into a system-on-a-chip. In order to explore a multitude of design variations a C++ simulation model of the accelerator has been integrated with a system-on-a-chip environment consisting of a general purpose processor, a DMA controller, an interrupt controller and a memory module. Dependent on the applications, different kinds of I/O buffers are designed around the processor array and the effects of the buffer size on the overall execution time are evaluated. The evaluations are based on new mathematical estimation models derived from the system and application constraints. The estimations are validated with experimental results with an error less than 1%. Exploring several designs points that using our architecture along with suitable buffer sizes, can improve the system execution time, one to two magnitudes for the selected algorithms. © 2009 IEEE.}, author = {Lari, Vahid and Hannig, Frank and Teich, Jürgen}, booktitle = {Proceedings of the 4th International Symposium on Embedded Multicore Systems-on-Chip}, date = {2009-09-22/2009-09-25}, doi = {10.1109/ICPPW.2009.72}, faupublication = {yes}, keywords = {Coarse-grained reconfigurable architectures; Double buffering mechanism; System performance evaluation; System-on-a-chip; Virtual system prototyping}, note = {UnivIS-Import:2015-04-16:Pub.2009.tech.IMMD.inform.system{\_}5}, pages = {528-534}, title = {{System} {Integration} of {Tightly}-{Coupled} {Reconfigurable} {Processor} {Arrays} and {Evaluation} of {Buffer} {Size} {Effects} on {Their} {Performance}}, venue = {Vienna}, year = {2009} } @inproceedings{faucris.246716421, author = {Pourmohseni, Behnaz and Teich, Jürgen}, booktitle = {PhD Forum at the Design, Automation, and Test in Europe (DATE) Conference and Exhibition}, date = {2020-03-09/2020-03-13}, faupublication = {yes}, pages = {1-2}, peerreviewed = {unknown}, title = {{System}-{Level} {Mapping}, {Analysis}, and {Management} of {Real}-{Time} {Applications} in {Many}-{Core} {Systems}}, url = {https://www12.cs.fau.de/downloads/pourmohseni/pub/phdForumDATE20.pdf}, venue = {Grenoble, France}, year = {2020} } @inproceedings{faucris.117421304, author = {Streubühr, Martin and Riedel, Christian and Haubelt, Christian and Teich, Jürgen}, booktitle = {Proceedings of 10. Workshop "Methoden und Beschreibungssprachen zur Modellierung und Verifikation von Schaltungen und Systemen"}, date = {2007-03-05/2007-03-07}, faupublication = {yes}, pages = {59-68}, peerreviewed = {unknown}, title = {{System} {Level} {Modeling} and {Performance} {Simulation} for {Dynamic} {Reconfigurable} {Computing} {Systems} in {SystemC}}, venue = {Erlangen}, year = {2007} } @inproceedings{faucris.118577624, address = {Kissingen, Germany}, author = {Zhang, Liyuan and Streubühr, Martin and Glaß, Michael and Teich, Jürgen and von Schwerin, Andreas and Liu, Kai}, booktitle = {Proc. of the Embedded World Conference}, date = {2012-02-28/2013-03-01}, faupublication = {yes}, note = {UnivIS-Import:2015-04-16:Pub.2012.tech.IMMD.inform.system}, publisher = {WEKA Fachzeitschriften Verlag}, title = {{System}-{Level} {Modeling} and {Simulation} of {Networked} {PROFINET} {IO} {Controllers}}, venue = {Nuremberg}, year = {2012} } @inproceedings{faucris.109925464, author = {Streubühr, Martin and Haubelt, Christian and Teich, Jürgen}, booktitle = {1st HiPEAC Workshop on Rapid Simulation and Performance Evaluation: Methods and Tools (RAPIDO), in conjunction with the 4th HiPEAC Conference}, date = {2009-01-25/2009-01-25}, faupublication = {yes}, pages = {47-52}, peerreviewed = {unknown}, title = {{System} {Level} {Performance} {Simulation} for {Heterogeneous} {Multi}-{Processor} {Architectures}}, venue = {Paphos}, year = {2009} } @inproceedings{faucris.118113864, abstract = {Safety-critical systems rely on redundancy schemes such as k-out-of-n structures which enable tolerance against multiple faults. These techniques are subject to Imperfect Fault Coverage (IFC) as error detection and recovery might be prone to errors or even impossible for certain fault models. As a result, these techniques may act as single points of failure in the system where uncovered faults might be overlooked and lead to wrong system outputs. Neglecting IFC in reliability analysis may lead to fatal overestimations in case of safety-critical applications. Yet, existing techniques that do consider IFC are overly pessimistic in assuming that the occurrence of an uncovered fault always results in a system failure. But often, in particular in complex systems with nested redundant structures, a fault that is not noticed by an inner redundancy scheme might be caught by an outer redundancy scheme. This paper proposes to automatically incorporate IFC into reliability models, i. e. Binary Decision Diagrams (BDDs), to enable an accurate reliability analysis for complex system structures including nested redundancies and repeated components. It also shows that IFC does not equally affect different redundancy schemes. Experimental results presented for applications in multimedia and automotive confirm that the proposed approach can analyze system reliability more accurately at an acceptable execution time and memory overhead compared to the underlying IFC-unaware technique.}, author = {Khosravi, Faramarz and Aliee, Hananeh and Teich, Jürgen}, booktitle = {15th IEEE/ACM Symposium on Embedded Systems for Real-Time Multimedia (ESTIMedia)}, date = {2017-10-15/2017-10-20}, doi = {10.1145/3139315.3141787}, faupublication = {yes}, isbn = {978-1-4503-5117-1/17/10}, keywords = {Reliability, redundancy, imperfect fault coverage, binary decision diagrams}, pages = {68-77}, peerreviewed = {unknown}, title = {{System}-{Level} {Reliability} {Analysis} {Considering} {Imperfect} {Fault} {Coverage}}, venue = {Seoul, Republic of Korea}, year = {2017} } @inproceedings{faucris.118577844, author = {Wildermann, Stefan and Reimann, Felix and Ziener, Daniel and Teich, Jürgen}, booktitle = {Proc. of the Workshop on Self-Awareness in Reconfigurable Computing Systems (SRCS)}, date = {2012-09-01/2012-09-01}, faupublication = {yes}, note = {UnivIS-Import:2015-04-16:Pub.2012.tech.IMMD.inform.system{\_}8}, pages = {4-7}, title = {{System} {Level} {Synthesis} {Flow} for {Self}-adaptive {Multi}-mode {Reconfigurable} {Systems}}, venue = {Oslo}, year = {2012} } @article{faucris.110706464, abstract = {In this paper, we consider system-level synthesis as the problem of optimally mapping a task-level specification onto a heterogeneous hardware/software architecture. This problem requires (1) the selection of the architecture (allocation) including general purpose and dedicated processors, ASICs, busses and memories, (2) the mapping of the specification onto the selected architecture in space (binding) and time (scheduling), and (3) the design space exploration with the goal to find a set of implementations that satisfy a number of constraints on cost and performance. Existing methodologies often consider a fixed architecture, perform the binding only, do not reflect the tight interdependency between binding and scheduling, do not consider communication (tasks and resources), or require long run-times preventing design space exploration, or yield only one implementation with optimal cost. Here, a model is introduced that handles all mentioned requirements and allows the task of system-synthesis to be specified as an optimization problem. The application and adaptation of an Evolutionary Algorithm to solve the tasks of optimization and design space exploration is described.}, author = {Blickle, Tobias and Teich, Jürgen and Thiele, Lothar}, faupublication = {no}, journal = {Design Automation For Embedded Systems}, keywords = {Design space exploration; Evolutionary algorithms; Hardware/software partitioning; System-synthesis}, note = {UnivIS-Import:2015-03-05:Pub.1998.tech.IMMD.inform.system}, pages = {23-58}, peerreviewed = {Yes}, title = {{System}-{Level} {Synthesis} {Using} {Evolutionary} {Algorithms}}, volume = {3}, year = {1998} } @misc{faucris.124085544, author = {Teich, Jürgen and Thiele, Lothar}, faupublication = {no}, peerreviewed = {automatic}, title = {{System}-level synthesis using evolutionary algorithms}, year = {1996} } @incollection{faucris.112588124, abstract = {Computations in hardware/software systems are inherently performed concurrently.
Hence, modeling hardware/software systems requires notions of concurrency.
Data-flow models have been and are still successfully applied in the modeling of hardware/software systems.
In this chapter, we motivate and introduce the usage of data-flow models.
Moreover, we discuss the expressiveness and analyzability of different data-flow Models of Computation (MoCs).
Subsequently, we present SysteMoC, an approach supporting many data-flow MoCs based on the system description language SystemC.
Besides specifying data-flow models, SystemMoC also permits the automatic classification of each different part of an application modeled in SysteMoC into a least expressive but most analyzable MoC.
This classification is the key to further optimization in later design stages of hardware/software systems, such as exploration of design alternatives as well as automatic code generation and hardware synthesis.
Such optimization and refinement steps are employed as part of the extsc{SystemCoDesigner} design flow that uses SysteMoC as its input languag}, address = {Dordrecht, The Netherlands}, author = {Falk, Joachim and Haubelt, Christian and Teich, Jürgen and Zebelein, Christian}, booktitle = {Handbook of Hardware/Software Codesign}, editor = {Ha S, Teich J}, faupublication = {yes}, isbn = {978-94-017-7266-2}, keywords = {dataflow, SysteMoC, SystemCoDesigner}, pages = {59 - 97}, peerreviewed = {Yes}, publisher = {Springer}, title = {{SysteMoC}: {A} {Data}-{Flow} {Programming} {Language} for {Codesign}}, volume = {1}, year = {2017} } @incollection{faucris.123166384, abstract = {As HPC systems are becoming increasingly heterogeneous and diverse, writing software that attains maximum performance and scalability while remaining portable as well as easily composable is getting more and more challenging. Additionally, code that has been aggressively optimized for certain execution platforms is usually not easily portable to others without either losing a great share of performance or investing many hours by re-applying optimizations. One possible remedy is to exploit the potential given by technologies such as domain-specific languages (DSLs) that provide appropriate abstractions and allow the application of technologies like automatic code generation and auto-tuning. In the domain of geometric multigrid solvers, project ExaStencils follows this road by aiming at providing highly optimized and scalable numerical solvers, specifically tuned for a given application and target platform. Here, we introduce its DSL ExaSlang with data types for local vectors to support computations that use point-local vectors and matrices. These data types allow an intuitive modeling of many physical problems represented by systems of partial differential equations (PDEs), e.g., the simulation of flows that include vector-valued velocitie}, address = {Berlin, Heidelberg, New York}, author = {Schmitt, Christian and Kuckuk, Sebastian and Hannig, Frank and Teich, Jürgen and Köstler, Harald and Rüde, Ulrich and Lengauer, Christian}, booktitle = {Software for Exascale Computing - SPPEXA 2013-2015}, doi = {10.1007/978-3-319-40528-5{\_}3}, faupublication = {yes}, isbn = {9783319405261}, month = {Jan}, note = {UnivIS-Import:2017-01-09:Pub.2016.tech.IMMD.lsinfs.system}, pages = {47-67}, peerreviewed = {Yes}, publisher = {Springer}, series = {Lecture Notes in Computational Science and Engineering}, title = {{Systems} of {Partial} {Differential} {Equations} in {ExaSlang}}, volume = {113}, year = {2016} } @inproceedings{faucris.121781924, address = {Berlin, Germany}, author = {Reimann, Felix and Glaß, Michael and Teich, Jürgen and Abelein, Ulrich}, booktitle = {Proc. Automotive meets Electronics, GMM Fachbericht 75}, date = {2013-02-19/2013-02-20}, faupublication = {yes}, isbn = {978-3-8007-3485-6}, note = {UnivIS-Import:2015-04-16:Pub.2013.tech.IMMD.inform.szenar}, pages = {15-20}, publisher = {VDE Verlag}, title = {{Szenarienbasierte} {Integration} von {Diagnosefunktionalität} in {E}/{E} {Architekturen}}, venue = {Dortmund}, year = {2013} } @inproceedings{faucris.120918424, abstract = {We propose a novel framework, called Virtual Processing Components (VPC), that permits the modeling and simulation of multiple processors running arbitrary scheduling strategies in SystemC. The granularity is given by task accuracy that guarantees a small simulation overhead.}, author = {Streubühr, Martin and Falk, Joachim and Teich, Jürgen and Haubelt, Christian and Dorsch, Rainer and Schlipf, Th}, booktitle = {Proceedings of Design, Automation and Test in Europe (DATE 2006), IEEE Computer Society}, date = {2006-03-06/2006-03-10}, faupublication = {yes}, isbn = {9783981080117}, pages = {480-481}, peerreviewed = {unknown}, title = {{Task}-accurate performance modeling in {SystemC} for real-time multi-processor architectures}, url = {https://www.scopus.com/inward/record.url?partnerID=HzOxMe3b&scp=34047100936&origin=inward}, venue = {Munich}, volume = {1}, year = {2006} } @misc{faucris.123871484, author = {Falk, Joachim and Haubelt, Christian and Teich, Jürgen}, faupublication = {yes}, peerreviewed = {automatic}, title = {{Task} {Graph} {Clustering} with {Internal} {State}}, year = {2007} } @article{faucris.272612829, author = {Pourmohseni, Behnaz and Wildermann, Stefan and Smirnov, Fedor and Meyer, Paul and Teich, Jürgen}, doi = {10.1109/ACCESS.2022.3162617}, faupublication = {yes}, journal = {IEEE Access}, peerreviewed = {Yes}, title = {{Task} {Migration} {Policy} for {Thermal}-{Aware} {Dynamic} {Performance} {Optimization} in {Many}-{Core} {Systems}}, year = {2022} } @inproceedings{faucris.116534704, abstract = {We consider the problem of executing a dynamically changing set of tasks on a reconfigurable system, made upon a processor and a reconfigurable device. Task execution on such a platform is man-aged by a scheduler that can allocate tasks either to the processor or to the reconfigurable device. The scheduler can be seen as part of an operating system running on the software or as core in the reconfigurable device. For each tasks to be executed on reconfigurable device, an equivalent implementation exists as rectangular block in a database. This block has to be placed on the device at run-time. A placer is responsible for the placement of tasks received from the scheduler on the reconfigurable device. However, the placement of tasks on the reconfigurable device will not be succesful if enough space is not available on the device to hold the task. In this case the scheduler receive an acknowledgment from the placer and decide either to preempt a running task or to run the task on software. We present in this work an implementation of a placer module as well as investigations on task preemption. The two modules are part of an operating system for reconfigurable system currently under development.}, author = {Ahmadinia, Ali and Bobda, Christophe and Koch, Dirk and Majer, Mateusz and Teich, Jürgen}, booktitle = {Proceedings of the 17th Symposium on Integrated Circuits and Systems Design (SBCCI)}, date = {2004-09-07/2004-09-11}, faupublication = {yes}, isbn = {1-58113-947-0}, keywords = {FPGA; Hardware Preemption; Partial Reconfiguration; Placement; Reconfigurable Computing; Scheduling}, note = {UnivIS-Import:2015-04-16:Pub.2004.tech.IMMD.inform.tasksc{\_}0}, pages = {22 - 27}, title = {{Task} {Scheduling} for {Heterogeneous} {Reconfigurable} {Computers}}, venue = {Pernambuco}, year = {2004} } @inproceedings{faucris.119659144, author = {Sousa, Éricles and Chakraborty, Arindam and Tanase, Alexandru-Petru and Hannig, Frank and Teich, Jürgen}, booktitle = {Demo Night at the IEEE International Conference on Reconfigurable Computing and FPGAs (ReConFig)}, date = {2017-12-04/2017-12-06}, doi = {10.1109/RECONFIG.2017.8279818}, faupublication = {yes}, peerreviewed = {Yes}, title = {{TCPA} {Editor}: {A} {Design} {Automation} {Environment} for a {Class} of {Coarse}-{Grained} {Reconfigurable} {Arrays}}, url = {http://ieeexplore.ieee.org/document/8279818/}, venue = {Cancun, Mexico}, year = {2017} } @article{faucris.117154004, abstract = {In this paper, we present techniques for providing on-demand structural redundancy for Coarse-Grained Reconfigurable Array (CGRAs) and a calculus for determining the gains of reliability when applying these replication techniques from the perspective of safety-critical parallel loop program applications. Here, for protecting massively parallel loop computations against errors like soft errors, well-known replication schemes such as Dual Modular Redundancy (DMR) and Triple Modular Redundancy (TMR) must be applied to each single Processor Element (PE) rather than one based on application requirements for reliability and Soft Error Rates (SERs). Moreover, different voting options and signal replication schemes are investigated. It will be shown that hardware voting may be accomplished at negligible hardware cost, i. e. less than two percent area overhead per PE, for a class of reconfigurable processor arrays called Tightly Coupled Processor Arrays (TCPAs). As a major contribution of this paper, a formal analysis of the reliability achievable by each combination of replication and voting scheme for parallel loop executions on CGRAs in dependence of a given SER and application timing characteristics (schedule) is elaborated. Using this analysis, error detection latencies may be computed and proper decisions which replication scheme to choose at runtime to guarantee a maximal probability of failure on-demand can be derived. Finally, fault-simulation results are provided and compared with the formal analysis of reliability.}, author = {Teich, Jürgen and Lari, Vahid and Tanase, Alexandru-Petru and Witterauf, Michael and Khosravi, Faramarz and Meyer, Brett}, doi = {10.1016/j.sysarc.2015.10.004}, faupublication = {yes}, journal = {Journal of Systems Architecture}, keywords = {Coarse-grained reconfigurable Architectures; Fault tolerance; Reliability analysis}, pages = {615-627}, peerreviewed = {Yes}, title = {{Techniques} for on-demand structural redundancy for massively parallel processor arrays}, volume = {61}, year = {2015} } @article{faucris.117161704, abstract = {This contribution provides an approach for emulating the behaviour of an ASIC temperature monitoring system (TMon) during run-time for a tightly-coupled processor array (TCPA) of a heterogeneous invasive multi-tile architecture to be used for FPGA prototyping. It is based on a thermal RC modeling approach. Also different usage scenarios of TCPA are analyzed and compared.}, author = {Glocker, E. and Boppu, Srinivas and Chen, Q. and Schlichtmann, U. and Teich, Jürgen and Schmitt-Landsiedel, D.}, doi = {10.5194/ars-12-103-2014}, faupublication = {yes}, journal = {Advances in Radio Science}, pages = {103-109}, peerreviewed = {unknown}, title = {{Temperature} modeling and emulation of an {ASIC} temperature monitor system for {Tightly}-{Coupled} {Processor} {Arrays} ({TCPAs})}, volume = {12}, year = {2014} } @inproceedings{faucris.118036864, author = {Ahmadinia, Ali and Bobda, Christophe and Teich, Jürgen}, booktitle = {Proceedings of the IEEE International Conference on Field-Programmable Technology}, date = {2003-12-15/2003-12-17}, faupublication = {yes}, note = {UnivIS-Import:2015-04-16:Pub.2003.tech.IMMD.inform.tempor}, pages = {359-362}, series = {Temporal Task Clustering for Online Placement on Reconfigurable Hardware}, title = {{Temporal} {Task} {Clustering} for {Online} {Placement} on {Reconfigurable} {Hardware}}, venue = {Tokyo, Japan,}, year = {2003} } @inproceedings{faucris.117397984, author = {Gladigau, Jens and Haubelt, Christian and Streubühr, Martin and Teich, Jürgen and Schneider, Axel and Knäblein, Joachim and Lindig, Michael}, booktitle = {Methoden und Beschreibungssprachen zur Modellierung und Verifikation von Schaltungen und Systemen}, date = {2009-03-02/2009-03-04}, faupublication = {yes}, pages = {157-166}, peerreviewed = {unknown}, title = {{Testfallgenerierung} für {SystemC}-{Designs} mit abstrakten {Modellbeschreibungen}}, venue = {Berlin}, year = {2009} } @inproceedings{faucris.118494684, abstract = {Ethernet is currently being discussed within the automotive community to become a general network technology for interconnecting future distributed automotive systems. If this is the case, a complete tool chain will be necessary to support developers to implement their functions. One important component is the availability of tools to monitor, generate, manipulate, and simulate traffic in distributed embedded systems. Today's established communication technologies like LIN, CAN, or FlexRay have the enormous advantage to be based on a physical shared medium which makes it relatively easy to add an additional test device to a network under test. This paper describes an approach to implement such a test device for switched Ethernet networks, presents performance measurements of our implemented Ethernet-Test-Switch, and introduces a concept to integrate simulated and existing devices with each other. © 2011 IEEE.}, address = {New York, NY, USA}, author = {Kern, Andreas and Zhang, Hongyan and Streichert, Thilo and Teich, Jürgen}, booktitle = {Proceedings of the 6th IEEE International Symposium on Industrial Embedded Systems (SIES'11)}, date = {2011-06-15/2011-06-17}, doi = {10.1109/SIES.2011.5953657}, faupublication = {yes}, keywords = {Ethernet; IP; network simulation; packet generation; packet monitoring; switch; testing; UDP}, note = {UnivIS-Import:2015-04-16:Pub.2011.tech.IMMD.inform.testin}, pages = {150-155}, publisher = {IEEE Press}, title = {{Testing} {Switched} {Ethernet} {Networks} in {Automotive} {Embedded} {Systems}}, venue = {Västeras}, year = {2011} } @inproceedings{faucris.234083931, abstract = {CUDA graph is an asynchronous task-graph programming model recently released by Nvidia. It encapsulates application workflows in a graph, with nodes being operations connected by dependencies. The new API brings two benefits: Reduced work launch overhead and whole workflow optimizations. In this paper, we improve the ability of CUDA graph to exploit workflow optimizations, e.g. concurrent kernel executions with complementary resource occupancy. Additionally, we argue that the advantages of DSLs are complementary to CUDA graph, and joining the two techniques can benefit from the best of both worlds. Here, we propose a compiler-based approach that combines CUDA graph with an image processing DSL and a source-to-source compiler called Hipacc. For ten image processing applications benchmarked on two Nvidia GPUs, our approach is able to achieve a geometric mean speedup of 1.30 over Hipacc without CUDA graph, 1.11 over CUDA graph without Hipacc, and 3.96 over another state-of-the-art DSL called Halid}, author = {Qiao, Bo and Özkan, Mehmet Akif and Teich, Jürgen and Hannig, Frank}, booktitle = {Proceedings of the 57th Annual Design Automation Conference (DAC)}, date = {2020-07-19/2020-07-23}, doi = {10.1109/DAC18072.2020.9218531}, faupublication = {yes}, peerreviewed = {Yes}, publisher = {IEEE}, title = {{The} {Best} of {Both} {Worlds}: {Combining} {CUDA} {Graph} with an {Image} {Processing} {DSL}}, venue = {San Francisco, CA}, year = {2020} } @inproceedings{faucris.120790824, author = {Arzt, Ulrich and Teich, Jürgen and Thiele, Lothar}, booktitle = {Proc. International Symposium on Circuits and Systems}, faupublication = {no}, note = {UnivIS-Import:2015-04-16:Pub.1992.tech.IMMD.inform.thecon}, pages = {681-684}, title = {{The} concepts of {COMPAR}: {A} compiler for massive parallel architectures}, venue = {San Diego, CA, U.S.A.,}, year = {1992} } @article{faucris.117421744, author = {Bobda, Christophe and Majer, Mateusz and Teich, Jürgen and Ahmadinia, Ali}, faupublication = {yes}, journal = {Journal of VLSI Signal Processing Systems for Signal, Image, and Video Technology}, note = {UnivIS-Import:2015-04-14:Pub.2006.tech.IMMD.inform.theerl}, pages = {15-31}, peerreviewed = {unknown}, title = {{The} {Erlangen} {Slot} {Machine}: {A} {Dynamically} {Reconfigurable} {FPGA}-{Based} {Computer}}, volume = {47}, year = {2007} } @inproceedings{faucris.118076464, abstract = {We present a new concept as well as the implementation of an FPGA-based reconfigurable platform, the Erlangen Slot Machine (ESM). The main advantages of this platform are: first, the possibility for each module to access its peripheries independent from its location through a programmable crossbar, and distributed SRAMs among slices. This allows an unrestricted relocation of modules on the device. Second, the intermodule structure allows an unlimited communication among running modules. © 2005 IEEE.}, author = {Ahmadinia, Ali and Bobda, Christophe and Fekete, Sandor P. and Haller, Thomas and Linarth, Andre Guilherme and Majer, Mateusz and Teich, Jürgen and Van Der Veen, Jan C.}, booktitle = {Proceedings of the 13th Annual IEEE Symposium on Field-Programmable Custom Computing Machines (FCCM'05)}, date = {2005-04-18/2005-04-20}, doi = {10.1109/FCCM.2005.63}, faupublication = {yes}, note = {UnivIS-Import:2015-04-16:Pub.2005.tech.IMMD.inform.theerl}, pages = {319-320}, title = {{The} {Erlangen} {Slot} {Machine}: {A} {Highly} {Flexible} {FPGA}-{Based} {Reconfigurable} {Platform}}, venue = {Marriott at Napa Valley, California}, year = {2005} } @article{faucris.111832204, author = {Angermeier, Josef and Göhringer, Diana and Majer, Mateusz and Teich, Jürgen and Fekete, Sandor P. and Van Der Veen, Jan C.}, faupublication = {yes}, journal = {it - Information Technology}, note = {UnivIS-Import:2015-03-09:Pub.2007.tech.IMMD.inform.theerl{\_}8}, pages = {143-148}, peerreviewed = {Yes}, title = {{The} {Erlangen} {Slot} {Machine}: {A} {Platform} for {Interdisciplinary} {Research} in {Reconfigurable} {Computing}}, volume = {49}, year = {2007} } @misc{faucris.112803064, author = {Mattauch, Sandra and Lohmann, Katja and Hannig, Frank and Lohmann, Daniel and Teich, Jürgen}, doi = {10.25593/issn.2191-5008/CS-2018-02}, faupublication = {yes}, peerreviewed = {automatic}, title = {{The} {Gender} {Gap} in {Computer} {Science} --- {A} {Bibliometric} {Analysis}}, year = {2018} } @book{faucris.108908404, author = {Ha, Soonhoi and Teich, Jürgen}, doi = {10.1007/978-94-017-7267-9{\_}4}, editor = {Ha S, Teich J}, faupublication = {yes}, peerreviewed = {unknown}, publisher = {Springer}, title = {{The} {Handbook} of {Hardware}/{Software} {Codesign}}, volume = {1}, year = {2017} } @article{faucris.263479105, author = {Alhaddad, Samer and Förstner, Jens and Groth, Stefan and Grünewald, Daniel and Grynko, Yevgen and Hannig, Frank and Kenter, Tobias and Pfreundt, Franz-Josef and Plessl, Christian and Schotte, Merlind and Steinke, Thomas and Teich, Jürgen and Weiser, Martin and Wende, Florian}, doi = {10.1002/cpe.6616}, faupublication = {yes}, journal = {Concurrency and Computation-Practice & Experience}, peerreviewed = {Yes}, title = {{The} {HighPerMeshes} {Framework} for {Numerical} {Algorithms} on {Unstructured} {Grids}}, year = {2021} } @inproceedings{faucris.118778044, author = {Weichslgartner, Andreas and Teich, Jürgen}, booktitle = {Proc. of the first International Workshop on Multi-Objective Many-Core Design (MOMAC) in conjunction with International Conference on Architecture of Computing Systems (ARCS)}, faupublication = {yes}, note = {UnivIS-Import:2015-04-17:Pub.2014.tech.IMMD.inform.theinv}, pages = {1-8}, title = {{The} {Invasive} {Network} on {Chip} - {A} {Multi}-{Objective} {Many}-{Core} {Communication} {Infrastructure}}, venue = {Lübeck}, year = {2014} } @misc{faucris.117405684, author = {Wildermann, Stefan and Teich, Jürgen}, faupublication = {yes}, peerreviewed = {automatic}, title = {{Theoretical} {Analysis} of {Fair} {Bandwidth} {Sharing} in {Priority}-based {Medium} {Access}}, year = {2008} } @inproceedings{faucris.118203184, author = {Hannig, Frank and Ruckdeschel, Holger and Teich, Jürgen}, booktitle = {Proceedings of the GI/ITG/GMM-Workshop -- Methoden und Beschreibungssprachen zur Modellierung und Verifikation von Schaltungen und Systemen}, date = {2008-03-03/2008-03-05}, faupublication = {yes}, note = {UnivIS-Import:2015-04-16:Pub.2008.tech.IMMD.inform.thepau}, pages = {?-?}, title = {{The} {PAULA} {Language} for {Designing} {Multi}-{Dimensional} {Dataflow}-{Intensive} {Applications}}, venue = {Freiburg}, year = {2008} } @inproceedings{faucris.224490668, author = {Pourmohseni, Behnaz and Smirnov, Fedor and Khdr, Heba and Wildermann, Stefan and Teich, Jürgen and Henkel, Jörg}, booktitle = {Proceedings of the 40th IEEE Real-Time Systems Symposium (RTSS)}, date = {2019-12-03/2019-12-06}, doi = {10.1109/RTSS46320.2019.00029}, faupublication = {yes}, pages = {1-13}, peerreviewed = {Yes}, title = {{Thermally} {Composable} {Hybrid} {Application} {Mapping} for {Real}-{Time} {Applications} in {Heterogeneous} {Many}-{Core} {Systems}}, venue = {Hong Kong}, year = {2019} } @inproceedings{faucris.123475704, abstract = {In this paper, the influence of ε-dominance on multi-objective particle swarm optimization (MOPSO) methods is studied. The most important role of ε-dominance is to bound the number of non-dominated solutions stored in the archive (archive size), which has influences on computational time, convergence and diversity of solutions. Here, ε-dominance is compared with the existing clustering technique for fixing the archive size and the solutions are compared in terms of computational time, convergence and diversity. A new diversity metric is also suggested. The results show that the ε-dominance method can find solutions much faster than the clustering technique with comparable and even in some cases better convergence and diversity. © 2003 IEEE.}, author = {Mostaghim, Sanaz and Teich, Jürgen}, booktitle = {Proceedings of the Congress on Evolutionary Computation (CEC'03)}, date = {2003-12-08/2003-12-12}, doi = {10.1109/CEC.2003.1299886}, faupublication = {yes}, note = {UnivIS-Import:2015-04-16:Pub.2003.tech.IMMD.inform.therol}, pages = {1764-1771}, peerreviewed = {unknown}, publisher = {IEEE Computer Society}, series = {The role of e-dominance in Multi-Objective Particle Swarm Optimization Methods}, title = {{The} role of e-dominance in {Multi}-{Objective} {Particle} {Swarm} {Optimization} {Methods}}, venue = {Canberra}, year = {2003} } @inproceedings{faucris.287773905, address = {Germany}, author = {Letras, Martin and Falk, Joachim and Teich, Jürgen}, booktitle = {Fourth Workshop on Next Generation Real-Time Embedded Systems (NG-RES 2023)}, date = {2023-01-18/2023-01-18}, doi = {10.4230/OASIcs.NG-RES.2023.6}, faupublication = {yes}, month = {Jan}, peerreviewed = {Yes}, title = {{Throughput} and {Memory} {Optimization} for {Parallel} {Implementations} of {Dataflow} {Networks} using {Multi}-{Reader} {Buffers}}, url = {https://drops.dagstuhl.de/opus/volltexte/2023/17737/}, venue = {Toulouse}, year = {2023} } @inproceedings{faucris.117918944, abstract = {Application modeling using dynamic dataflow graphs is wellsuited for multi-core platforms. However, there is often a mismatch between the fine granularity of the application and the platform. Tailoring this granularity to the platform promises performance gains by (a) reducing dynamic scheduling overhead and (b) exploiting compiler optimizations. In this paper, we propose a throughput-optimizing compilation approach that uses Quasi-Static Schedules (QSSs) to combine actors of static dataflow subgraphs. Our proposed approach combines core allocation, QSSs, and actor binding in a Design Space Exploration (DSE), optimizing the throughput for a number of available cores. During the DSE, each implementation candidate is compiled to and evaluated on the target hardware-here an Intel i7 and an ARM Cortex-A9. Experimental results including synthetic benchmarks as well as a real-world control application show that our proposed holistic compilation approach outperforms classic DSEs that are agnostic of QSS as well as a DSE that employs QSS as a post-processing step. Amongst others, we show a case where the compilation approach obtains a speedup of 9.91 x for a 4-core implementation, while a classic DSE only obtains a speedup of 2.12 x.}, author = {Schwarzer, Tobias and Falk, Joachim and Glaß, Michael and Teich, Jürgen and Zebelein, Christian and Haubelt, Christian}, booktitle = {Proceedings of the 18th International Workshop on Software and Compilers for Embedded Systems (SCOPES)}, doi = {10.1145/2764967.2764972}, faupublication = {yes}, isbn = {9781450335935}, keywords = {Clustering; Compilation; Dataflow; Design space exploration; Multi-core}, pages = {68-75}, peerreviewed = {unknown}, publisher = {Association for Computing Machinery, Inc}, title = {{Throughput}-optimizing compilation of dataflow applications for multi-cores using quasi-static scheduling}, venue = {St. Goar}, year = {2015} } @article{faucris.109345764, author = {Mitra, Tulika and Teich, Jürgen and Thiele, Lothar}, doi = {10.1109/MDAT.2018.2794204}, faupublication = {yes}, journal = {IEEE Design and Test of Computers}, pages = {8-26}, peerreviewed = {Yes}, title = {{Time}-{Critical} {Systems} {Design}: {A} {Survey}}, volume = {35}, year = {2018} } @inproceedings{faucris.108964724, author = {Gangadharan, Deepak and Tanase, Alexandru-Petru and Hannig, Frank and Teich, Jürgen}, booktitle = {DATE Friday Workshop on Performance, Power and Predictability of Many-Core Embedded Systems (3PMCES)}, date = {2014-03-28/2014-03-28}, faupublication = {yes}, peerreviewed = {unknown}, title = {{Timing} {Analysis} of a {Heterogeneous} {Architecture} with {Massively} {Parallel} {Processor} {Arrays}}, url = {https://ecsi.org/resource/workshop/2014/3PMCES/DATE/paper/timing-analysis-heterogeneous-architecture-massively-parallel-processor-arrays}, venue = {Dresden, Germany}, year = {2014} } @inproceedings{faucris.118710504, abstract = {Due to ever-increasing bandwidth requirements of modern automotive applications, Ethernet AVB is becoming a standard high-speed bus in automotive E/E architectures. Since Ethernet AVB is tailored to audio and video entertainment, existing analysis approaches neglect the specific requirements and features of heterogeneous E/E architectures and their applications. This paper presents a timing analysis technique based on Real Time Calculus to consider Ethernet AVB in complex E/E architectures, reflecting key features such as static routing and stream reservation, fixed topology, and real-time applications. A comparison with a simulation on case studies from the automotive domain gives evidence that the proposed technique delivers valuable bounds for complete sensor-to-actuator chains, enabling automatic system synthesis and design space exploration approaches. © 2013 IEEE.}, address = {Red Hook, NY, USA}, author = {Reimann, Felix and Graf, Sebastian and Streit, Fabian and Glaß, Michael and Teich, Jürgen}, booktitle = {Proc. IEEE International Conference on Emerging Technology & Factory Automation}, date = {2013-09-10/2013-09-13}, doi = {10.1109/ETFA.2013.6648024}, faupublication = {yes}, isbn = {9781479908622}, note = {UnivIS-Import:2015-04-16:Pub.2013.tech.IMMD.inform.timing}, pages = {1-8}, publisher = {Curran Associates}, title = {{Timing} {Analysis} of {Ethernet} {AVB}-based {Automotive} {E}/{E} {Architectures}}, url = {http://www.etfa2013.org}, venue = {Cagliari}, year = {2013} } @misc{faucris.110015664, author = {Teich, Jürgen}, faupublication = {yes}, peerreviewed = {automatic}, title = {{Timing} {Analysis} of {Systems} of {Communicating} {Tasks} with {Internal} {State}}, year = {2006} } @inproceedings{faucris.117739424, address = {Berlin, Heidelberg, New York}, author = {Teich, Jürgen and Kaxiras, Stefanos and Plaks, Toomas and Flautner, Krisztián}, booktitle = {Proceedings of the 12th International Euro-Par Conference}, date = {2006-08-28/2006-09-01}, doi = {10.1007/11823285{\_}124}, faupublication = {yes}, isbn = {978-3-540-37783-2}, note = {UnivIS-Import:2015-04-16:Pub.2006.tech.IMMD.inform.topic1}, pages = {1179}, publisher = {Springer-verlag}, series = {Lecture Notes in Computer Science (LNCS)}, title = {{Topic} 18: {Embedded} {Parallel} {Systems}}, venue = {Dresden}, year = {2006} } @inproceedings{faucris.106486864, abstract = {Application details uncertain at design time as well as tolerance against permanent resource defects demand flexibility and redundancy. In this context, we present a strategy for placing replicas in embedded point-to-point networks where link as well as node defects may occur at runtime. The proposed strategies for replica placement are based on the partitioning of the network into biconnected components. We are able to distinguish between different replication strategies, i.e., active and passive replication. Our experimental results show that the reliability improvement due to the proposed replica placement strategies is up to 23% compared to a randomized strategy. © 2008 Springer-Verlag Berlin Heidelberg.}, author = {Streichert, Thilo and Glaß, Michael and Wanka, Rolf and Haubelt, Christian and Teich, Jürgen}, booktitle = {Proc. 21st International Conference on Architecture of Computing Systems (ARCS)}, date = {2008-02-25/2008-02-28}, doi = {10.1007/978-3-540-78153-0{\_}4}, faupublication = {yes}, pages = {23-37}, title = {{Topology}-aware replica placement in fault-tolerant embedded networks}, url = {http://www12.cs.fau.de/people/rwanka/publications/SGWHT08.php}, venue = {Dresden}, year = {2008} } @misc{faucris.317125174, abstract = {Convolutional Neural Networks (CNNs) are widely employed to solve various problems, e.g., image classification. Due to their compute- and data-intensive nature, CNN accelerators have been developed as ASICs or on FPGAs. Increasing complexity of applications has caused resource costs and energy requirements of these accelerators to grow. Spiking Neural Networks (SNNs) are an emerging alternative to CNN implementations, promising higher resource and energy efficiency. The main research question addressed in this paper is whether SNN accelerators truly meet these expectations of reduced energy requirements compared to their CNN equivalents. For this purpose, we analyze multiple SNN hardware accelerators for FPGAs regarding performance and energy efficiency. We present a novel encoding scheme of spike event queues and a novel memory organization technique to improve SNN energy efficiency further. Both techniques have been integrated into a state-of-the-art SNN architecture and evaluated for MNIST, SVHN, and CIFAR-10 datasets and corresponding network architectures on two differently sized modern FPGA platforms. For small-scale benchmarks such as MNIST, SNN designs provide rather no or little latency and energy efficiency advantages over corresponding CNN implementations. For more complex benchmarks such as SVHN and CIFAR-10, the trend reverses}, author = {Plagwitz, Patrick and Hannig, Frank and Teich, Jürgen and Keszöcze, Oliver}, faupublication = {yes}, peerreviewed = {automatic}, title = {{To} {Spike} or {Not} to {Spike}? {A} {Quantitative} {Comparison} of {SNN} and {CNN} {FPGA} {Implementations}}, url = {https://arxiv.org/abs/2306.12742}, year = {2023} } @inproceedings{faucris.122616164, author = {Roloff, Sascha and Hannig, Frank and Teich, Jürgen}, booktitle = {Proc. of the first International Workshop on Multi-Objective Many-Core Design (MOMAC) in conjunction with International Conference on Architecture of Computing Systems (ARCS)}, faupublication = {yes}, note = {UnivIS-Import:2015-04-17:Pub.2014.tech.IMMD.inform.toward{\_}7}, pages = {1-2}, title = {{Towards} {Actor}-oriented {Programming} on {PGAS}-based {Multicore} {Architectures}}, venue = {Lübeck}, year = {2014} } @article{faucris.114101504, abstract = {High Performance Computing (HPC) systems are nowadays more and more heterogeneous. Different processor types can be found on a single node including accelerators such as Graphics Processing Units (GPUs). To cope with the challenge of programming such complex systems, this work presents a domain-specific approach to automatically generate code tailored to different processor types. Low-level CUDA and OpenCL code is generated from a high-level description of an algorithm specified in a Domain-Specific Language (DSL) instead of writing hand-tuned code for GPU accelerators. The DSL is part of the Heterogeneous Image Processing Acceleration (HIPA^cc) framework and was extended in this work to handle grid hierarchies in order to model different cycle types. Language constructs are introduced to process and represent data at different resolutions. This allows to describe image processing algorithms that work on image pyramids as well as multigrid methods in the stencil domain. By decoupling the algorithm from its schedule, the proposed approach allows to generate efficient stencil code implementations. Our results show that similar performance compared to hand-tuned codes can be achieve}, author = {Membarth, Richard and Reiche, Oliver and Schmitt, Christian and Hannig, Frank and Teich, Jürgen and Stürmer, Markus and Köstler, Harald}, doi = {10.1016/j.jpdc.2014.08.008}, faupublication = {yes}, journal = {Journal of Parallel and Distributed Computing}, keywords = {Multigrid; Multiresolution; Image pyramid; Domain-specific language; Stencil codes; Code generation; GPU; CUDA; OpenCL}, note = {UnivIS-Import:2015-03-11:Pub.2014.tech.IMMD.inform.toward{\_}67}, pages = {3191-3201}, peerreviewed = {Yes}, title = {{Towards} a {Performance}-portable {Description} of {Geometric} {Multigrid} {Algorithms} using a {Domain}-specific {Language}}, volume = {74}, year = {2014} } @inproceedings{faucris.108186804, abstract = {High Performance Computing (HPC) systems are nowadays more and more heterogeneous. Different processor types can be found on a single node including accelerators such as Graphics Processing Units (GPUs). To cope with the challenge of programming such complex systems, this work presents a domain-specific approach to automatically generate code tailored to different processor types. Low-level CUDA and OpenCL is generated from a high-level description of a geometric multigrid algorithm written in a Domain-Specific Language (DSL) instead of writing hand-tuned code for GPU accelerators. By decoupling the algorithm from its schedule, the proposed approach allows to generate efficient stencil codes. Our results show that competitive performance compared to hand-tuned codes can be achieved and that more than 25 frames per second for 16.8 Megapixel images are obtained for full High Dynamic Range (HDR) compression of 2D medical data sets. © 2012 IEEE.}, author = {Membarth, Richard and Hannig, Frank and Teich, Jürgen and Köstler, Harald}, booktitle = {Proceedings of the 2nd International Workshop on Domain-Specific Languages and High-Level Frameworks for High Performance Computing (WOLFHPC)}, date = {2012-11-10/2012-11-16}, doi = {10.1109/SC.Companion.2012.136}, faupublication = {yes}, keywords = {code generation; CUDA; domain-specific language; GPU; OpenCL; stencil codes}, note = {UnivIS-Import:2015-04-16:Pub.2012.tech.IMMD.inform.toward}, pages = {1-6}, title = {{Towards} {Domain}-specific {Computing} for {Stencil} {Codes} in {HPC}}, venue = {Salt Lake City, UT}, year = {2012} } @inproceedings{faucris.123721224, abstract = {In this paper, we apply a new programming paradigm called resource-aware programming to the Single-Chip Cloud Computer. According to this paradigm, an application may change at certain points of execution its allocation of resources. This gives application engineers the opportunity to dynamically adapt an algorithm's behavior and parallelism to the work load and state of the underlying resources (e. g., availability, clock frequency, temperature). Resource-aware programming can provide a selforganizing behavior to conventional programs for being able to not only tolerate certain types of faults and cope with feature variations, but also to provide scalability, higher resource utilization, as well as performance and power gains by managing voltage/frequency islands and adjusting the amount of allocated resources to the temporal needs of a running application. We discuss the details of resource-aware programming as well as three alternative implementation concepts that we intend to evaluate. Finally, we present the results of initial experiments, we conducted using a centralized resource management framework on the Single- Chip Cloud Computer.}, address = {Karlsruhe, Germany}, author = {Kouveli, Georgia and Hannig, Frank and Lupp, Jan-Hugo and Teich, Jürgen}, booktitle = {Proceedings of the 3rd MARC Symposium}, date = {2011-07-05/2011-07-06}, doi = {10.5445/KSP/1000023937}, faupublication = {yes}, isbn = {978-3-86644-717-2}, note = {UnivIS-Import:2015-04-16:Pub.2011.tech.IMMD.inform.toward{\_}0}, pages = {111-114}, publisher = {KIT Scientific Publishing}, title = {{Towards} {Resource}-{Aware} {Programming} on {Intel}'s {Single}-{Chip} {Cloud} {Computer} {Processor}}, venue = {Ettlingen}, year = {2011} } @inproceedings{faucris.122615944, abstract = {Symbolic encoding for resource allocation, task binding, and message routing during multi-objective design space exploration (DSE) has gained significant attention in recent years. To determine the message routing, existing symbolic approaches typically rely on an explicit encoding of routing hops which results in a huge number of required variables and/or constraints. As a result, these approaches fail in case of large network diameters and/or a huge number of messages or resources and even for smaller problems, the convergence of the involved optimization process suffers. To tackle this shortcomings, this work proposes three novel symbolic routing encoding strategies that all avoid to encode hops explicitly, but are based on an encoding of individual links or complete sender-receiver paths, but still cover the same design space. The result is a more compact problem representation with less constraints and, in particular, less variables; the latter eliminates ineffective degrees of freedom from the search space and significantly enhances the optimization quality of a multi-objective optimization with even non-linear objectives. In an extensive test-suite, three major classes of wired networked embedded systems are considered: (a) hierarchical stars as in MPSoCs or automotive, (b) redundant backbone buses as common in rail systems or avionics, and (c) mesh-based architectures that often occur in NoC-based MPSoCs. For all three classes, the proposed approaches significantly outperform existing techniques in both scalability and optimization quality and, thus, considerably enlarge the field of application of a multi-objective DSE for the networked embedded system design. Copyright is held by the owner/author(s).}, address = {New York, NY, USA}, author = {Graf, Sebastian and Reimann, Felix and Glaß, Michael and Teich, Jürgen}, booktitle = {Proc. of the International Conference on Hardware/Software Codesign and System Synthesis (CODES+ISSS)}, date = {2014-10-12/2014-10-17}, doi = {10.1145/2656075.2656102}, faupublication = {yes}, note = {UnivIS-Import:2015-04-17:Pub.2014.tech.IMMD.inform.toward{\_}5}, pages = {2:1-2:10}, publisher = {IEEE Press}, title = {{Towards} {Scalable} {Symbolic} {Routing} for {Multi}-{Objective} {Networked} {Embedded} {System} {Design} and {Optimization}}, venue = {New Delhi}, year = {2014} } @inproceedings{faucris.117168744, abstract = {State-of-the-art automatic reliability analyses as used in system-level design approaches mainly rely on Binary Decision Diagrams (BDDs) and, thus, face two serious problems: (1) The BDDs exhaust available memory during their construction and/or (2) the final size of the BDDs is, sometimes up to several orders of magnitude, larger than the available memory. The contribution of this paper is twofold: (1) A partitioning-based early quantification technique is presented that aims to keep the size of the BDDs during construction at minimum. (2) A SAT-assisted simulation approach aims to deliver approximated results when exact analysis techniques fail because the final BDDs exhaust available memory. The ability of both methods to accurately analyze larger and more complex systems than known approaches is demonstrated for various test cases. Copyright 2010 ACM.}, author = {Glaß, Michael and Lukasiewycz, Martin and Haubelt, Christian and Teich, Jürgen}, booktitle = {47th Design Automation Conference, DAC '10}, date = {2010-06-13/2010-06-18}, doi = {10.1145/1837274.1837334}, faupublication = {yes}, isbn = {9781450300025}, keywords = {Early quantification; Reliability analysis; SAT-assisted simulation}, pages = {234-239}, peerreviewed = {unknown}, title = {{Towards} scalable system-level reliability analysis}, venue = {Anaheim, CA}, year = {2010} } @inproceedings{faucris.118494904, abstract = {Invasive computing is a novel computing paradigm, which allows us to allocate several resources at run-time. Tightly-coupled processor arrays are well suited for invasive computing. This paper proposes a methodology, to symbolically program a claimed array of computational resources. Using this methodology, a single configuration stream can be derived, which is sufficient to configure all the claimed resources (processing elements) irrespective of the number of resources claimed. The configuration stream is modified at run-time dynamically, depending on the number of processors claimed. Configuration memory requirements were estimated for our methodology. It requires constant memory size and is independent of the problem size compared to a traditional approach. © 2011 IEEE.}, address = {New York, NY, USA}, author = {Boppu, Srinivas and Hannig, Frank and Teich, Jürgen and Pérez-Andrade, Roberto}, booktitle = {Proc. of ReConFig}, date = {2011-11-30/2011-12-02}, doi = {10.1109/ReConFig.2011.91}, faupublication = {yes}, isbn = {978-1-4577-1734-5}, keywords = {coarse grained architectures; dynamic reconfiguration}, note = {UnivIS-Import:2015-04-16:Pub.2011.tech.IMMD.inform.toward}, pages = {392-397}, publisher = {IEEE Press}, title = {{Towards} {Symbolic} {Run}-{Time} {Reconfiguration} in {Tightly}-{Coupled} {Processor} {Arrays}}, url = {http://www.computer.org/portal/web/csdl/doi/10.1109/ReConFig.2011.91}, venue = {Cancun}, year = {2011} } @article{faucris.204074354, author = {Vogel-Heuser, Birgit and Wildermann, Stefan and Teich, Jürgen}, doi = {10.1007/s11740-017-0765-0}, faupublication = {yes}, journal = {Production Engineering}, pages = {687-694}, peerreviewed = {unknown}, title = {{Towards} the co-evolution of industrial products and its production systems by combining models from development and hardware/software deployment in cyber-physical systems}, volume = {11}, year = {2017} } @article{faucris.106895184, author = {Vogel-Heuser, Birgit and Wildermann, Stefan and Teich, Jürgen}, doi = {10.1007/s11740-017-0765-0}, faupublication = {yes}, journal = {Production Engineering}, pages = {687-694}, peerreviewed = {Yes}, title = {{Towards} the {Co}-{Evolution} of {Industrial} {Products} and its {Production} {Systems} by {Combining} {Models} from {Development} and {Hardware}/{Software} {Deployment} in {Cyber}-{Physical} {Systems}}, url = {https://link.springer.com/article/10.1007/s11740-017-0765-0}, volume = {11}, year = {2017} } @inproceedings{faucris.276973594, author = {Plagwitz, Patrick and Hannig, Frank and Teich, Jürgen}, booktitle = {IEEE Proceedings of the 32nd International Conference on Field Programmable Logic and Applications}, date = {2022-08-29/2022-09-02}, doi = {10.1109/FPL57034.2022.00015}, faupublication = {yes}, peerreviewed = {Yes}, title = {{TRAC}: {Compilation}-based {Design} of {Transformer} {Accelerators} for {FPGAs}}, venue = {Belfast, United Kingdom}, year = {2022} } @inproceedings{faucris.121653664, abstract = {Sorting long sequences of keys is a problem that occurs in many different applications. For embedded systems, a uniprocessor software solution is often not applicable due to the low performance, while realizing multiprocessor sorting methods on parallel computers is much too expensive with respect to power consumption, physical weight, and cost. We investigate cost/performance tradeoffs for hybrid sorting algorithms that use a mixture of sequential merge sort and systolic insertion sort techniques. We propose a scalable architecture for integer sorting that consists of a uniprocessor and an FPGA-based parallel systolic co-processor. Speedups obtained analytically and experimentally and depending on hardware (cost) constraints are determined as a function of time constants of the uniprocessor and the co-processor.}, author = {Bednara, Marcus and Beyer, O. and Teich, Jürgen and Wanka, Rolf}, booktitle = {Proc. ASAP'00, the Int. Conf. on Application Specific Systems, Architectures, and Processors, pp. 299-308, Boston, MA, U.S.A. IEEE Computer Society Press, July 2000Proc. ASAP'00, the Int. Conf. on Application Specific Systems, Architectures, and Processors}, date = {2000-07-10/2000-07-12}, faupublication = {no}, note = {UnivIS-Import:2015-04-16:Pub.2000.tech.IMMD.inform.tradeo}, pages = {299-308}, publisher = {IEEE}, title = {{Tradeoff} {Analysis} and {Architecture} {Design} of a {Hybrid} {Hardware}/{Software} {Sorter}}, venue = {Boston, MA}, year = {2000} } @inproceedings{faucris.119108044, abstract = {FPGAs are an attractive platform for elliptic curve cryptography hardware. Since field multiplication is the most critical operation in elliptic curve cryptography, we have studied how efficient several field multipliers can be mapped to lookup table based FPGAs. Furthermore we have compared different curve coordinate representations with respect to the number of required field operations, and show how an elliptic curve coprocessor based on the Montgomery algorithm for curve multiplication can be implemented using our generic coprocessor architecture.}, author = {Bednara, Marcus and Daldrup, M. and J. Shokrollahi, J. and Teich, Jürgen and von zur Gathen, J.}, booktitle = {Proc. of IEEE International Symposium on Circuits and Systems}, date = {2002-05-26/2002-05-29}, faupublication = {no}, note = {UnivIS-Import:2015-04-16:Pub.2002.tech.IMMD.inform.tradeo}, publisher = {Institute of Electrical and Electronics Engineers}, title = {{Tradeoff} {Analysis} of {FPGA} {Based} {Elliptic} {Curve} {Cryptography}}, venue = {Scottsdale, Arizona}, year = {2002} } @inproceedings{faucris.121680724, author = {Boppu, Srinivas and Lari, Vahid and Hannig, Frank and Teich, Jürgen}, booktitle = {Proc. Synopsys Users Group Conference}, date = {2013-05-14/2013-05-14}, faupublication = {yes}, note = {UnivIS-Import:2015-04-16:Pub.2013.tech.IMMD.inform.transa}, pages = {1-15}, title = {{Transactor}-based {Prototyping} of {Heterogeneous} {Multiprocessor} {System}-{On}-{Chip} {Architectures}}, venue = {Munich}, year = {2013} } @incollection{faucris.238939876, author = {Aliee, Hananeh and Glaß, Michael and Khosravi, Faramarz and Teich, Jürgen}, booktitle = {Dependable Embedded Systems}, doi = {10.1007/978-3-030-52017-5{\_}19}, editor = {Henkel, Jörg, Dutt, Nikil}, faupublication = {yes}, isbn = {978-3-030-52017-5}, peerreviewed = {unknown}, title = {{Uncertainty}-{Aware} {Compositional} {System}-{Level} {Reliability} {Analysis}}, year = {2020} } @inproceedings{faucris.121962544, abstract = {Due to manufacturing tolerances and aging effects, future embedded systems have to cope with unreliable components. The intensity of such effects depends on uncertain aspects like environmental or usage conditions such that highly safety-critical systems are pessimistically designed for worst-case mission profiles. In this work, we propose to explicitly model the uncertain characteristics of system components, i. e. we model components using reliability functions with parameters distributed between a best and worst case. Since destructive effects like temperature may affect several components simultaneously (e. g. those in the same package), a correlation between uncertainties of components exists. The proposed uncertainty-aware method combines a formal analysis approach and a Monte Carlo simulation to consider uncertain characteristics and their different correlations. It delivers a holistic view on the system's reliability with best/worst/average-case behavior and also insights on variance and quantiles. But, existing optimization approaches typically assume design objectives to be single values or to follow a predefined distribution. As a remedy, we propose a dominance criterion for meta-heuristic optimization approaches like evolutionary algorithms that enables the comparison of system implementations with arbitrarily distributed characteristics. Our presented experimental results show that (a) the proposed analysis comes at low overhead while capturing existing uncertainties with sufficient accuracy, and (b) the optimization process is significantly enhanced when guiding the search process by additional aspects like variance and the 95% quantile, delivering better system implementations as found by an uncertainty-oblivious optimization approach.}, author = {Khosravi, Faramarz and Müller, Malte and Glaß, Michael and Teich, Jürgen}, booktitle = {Proceedings of Design, Automation and Test in Europe (DATE 2015)}, date = {2015-03-09/2015-03-13}, doi = {10.7873/DATE.2015.0319}, faupublication = {yes}, isbn = {9783981537048}, pages = {97-102}, peerreviewed = {unknown}, publisher = {Institute of Electrical and Electronics Engineers Inc.}, title = {{Uncertainty}-aware reliability analysis and optimization}, venue = {Grenoble}, year = {2015} } @inproceedings{faucris.122431584, author = {Teich, Jürgen and Thiele, Lothar}, booktitle = {In Proc. IEEE Int. Symp. Circuits and Systems (ISCAS)}, faupublication = {no}, pages = {344a-347a}, peerreviewed = {unknown}, title = {{Uniform} design of parallel programs for {DSP}}, venue = {Singapore}, year = {1991} } @inproceedings{faucris.118495564, abstract = {Heterogeneous reconfigurable SoCs provide more flexibility, maintainability, and re-usability than hardwired SoCs. Designing such systems is a complex task, since early decisions, as design partitioning, influence the subsequent design steps, such as placement of partially reconfigurable modules In this paper, we investigate a symbolic design space exploration (DSE) approach for this kind of SoCs, where we transform the problem of finding a feasible implementation to a Boolean satisfiability problem (SAT). We present three encoding variants which unify partitioning and placement to overcome the drawbacks of their separation. In particular, we will show that the runtime of DSE can be speeded up when we perform a preprocessing mechansim that identifies those partitionings which inevitably lead to infeasibility, and then incorporate this information into the symbolic encoding for calculating feasible placements. Our experiments show the effectiveness of our SAT-based approach and compare the presented encoding variants. © 2011 IEEE.}, address = {New York, NY, USA}, author = {Wildermann, Stefan and Ziener, Daniel and Teich, Jürgen}, booktitle = {Proc. of the International Conference on Field Programmable Logic and Applications}, date = {2011-09-05/2011-09-07}, doi = {10.1109/FPL.2011.85}, faupublication = {yes}, isbn = {978-1-4577-1484-9}, note = {UnivIS-Import:2015-04-16:Pub.2011.tech.IMMD.inform.unifyi}, pages = {429-434}, publisher = {IEEE Press}, title = {{Unifying} {Partitioning} and {Placement} for {SAT}-based {Exploration} of {Heterogeneous} {Reconfigurable} {SoCs}}, venue = {Chania, Crete}, year = {2011} } @inproceedings{faucris.118578284, address = {Hamburg, Germany}, author = {Graf, Sebastian and Glaß, Michael and Teich, Jürgen}, booktitle = {Tagungsunterlagen Methoden und Beschreibungssprachen zur Modellierung und Verifikation von Schaltungen und Systemen (MBMV)}, date = {2012-03-05/2012-03-07}, faupublication = {yes}, isbn = {978-1-4673-2295-9}, note = {UnivIS-Import:2015-04-16:Pub.2012.tech.IMMD.inform.unreli}, pages = {13-24}, publisher = {Verlag Dr. Kovac}, title = {{Unreliable} {Data} {Transmissions} and {Limited} {Hardware} {Communication} {Buffers} in {Automotive} {E}/{E} {Virtual} {Prototypes}}, venue = {Kaiserslautern}, year = {2012} } @inproceedings{faucris.232217851, abstract = {Multiresolution filters, analyzing information at different scales, are crucial for many applications in digital image processing. The different space and time complexity at distinct scales in the unique pyramidal structure poses a challenge as well as an opportunity to implementations on modern accelerators such as GPUs with an increasing number of compute units. In this paper, we exploit the potential of concurrent kernel execution in multiresolution filters. As a major contribution, we present a model-based approach for performance analysis of as well single- as multi-stream implementations, combining both application- and architecture-specific knowledge. As a second contribution, the involved transformations and code generators using CUDA streams on Nvidia GPUs have been integrated into a compiler-based approach using an image processing DSL called Hipacc. We then apply our approach to evaluate and compare the achieved performance for four real-world applications on three GPUs. The results show that our method can achieve a geometric mean speedup of up to 2.5 over the original Hipacc implementation without our approach, up to 2.0 over the other state-of-the-art DSL Halide, and up to 1.3 over the recently released programming model CUDA Graph from Nvidia.the modeling of complex dynamic environments or run-time requirements is either not possible or comes at the cost of significant computation overheads or results of lower quality. As a remedy, this paper introduces Learning Optimizer Constrained by ALtering conditions (LOCAL), a novel optimization framework for the optimization of dynamically adaptable embedded systems. Following the structure of Learning Classifier System (LCS) optimizers, the proposed framework optimizes a strategy, i.e., a set of conditionally applicable solutions for the problem at hand, instead of a set of independent solutions. The framework enables the designer to model complex environmental behavior, making this problem-specific knowledge accessible to the optimizer. We show how the proposed framework—which can be used for the optimization of any dynamic system—is used for the optimization of dynamically reconfigurable many-core systems and provide experimental evidence that the hereby obtained strategy offers superior embeddability compared to the solutions provided by a s.o.t.a. hybrid approach which uses an evolutionary algorithm.