init: Beep Boop

2022-01-10 23:50:17 +01:00 · 2022-01-10 23:50:17 +01:00 · 6e15d25210
commit 6e15d25210
23 changed files with 1081 additions and 0 deletions
--- a/.dir-locals.el
+++ b/.dir-locals.el
@ -0,0 +1 @@
+((nil . ((projectile-project-install-cmd . "mvn -B clean compile assembly:single && java -jar target/*.jar"))))
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,4 @@
+target/
+.settings/
+.project
+.classpath
--- a/README.md
+++ b/README.md
@ -0,0 +1,26 @@
+# How to run
+
+Right now, I am too lazy to create a proper CLI, so the invocation is a bit hacky. Sorry...
+
+```sh
+# Device selected by Devices.java (One has to know the selection in advance though...)
+# DEFAULT = 1
+DEVICE=1
+
+# The different targets in App.java
+# 0 = Informational output
+# 10 = All PrimeNumber related implementations at once
+# 20 = All Reduce related implementations at once
+# 21 - 25 = Reduce1 - Reduce5 respectively
+# 30 = All Prefix related implementations at once
+# 31 - 32 = Prefix1 - Prefix2 respectively
+# DEFAULT = 0
+TARGET=31
+
+# How many shifts for N (e.g. if N should be 8, N_LSHIFTS should be 3)
+# Targets 2X and 3X depend on this parameter
+# DEFAULT = 25
+N_LSHIFTS=3
+
+mvn -B clean compile assembly:single && java -jar target/*.jar $DEVICE $TARGET $N_LSHIFTS
+```
--- a/pom.xml
+++ b/pom.xml
@ -0,0 +1,103 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <groupId>edu.thi.phga</groupId>
+    <artifactId>aparapi-test</artifactId>
+    <version>0.0.1-SNAPSHOT</version>
+
+    <name>aparapi-test</name>
+    <!-- FIXME change it to the project's website -->
+    <url>http://www.example.com</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+        <maven.compiler.source>17</maven.compiler.source>
+        <maven.compiler.target>17</maven.compiler.target>
+    </properties>
+
+    <dependencies>
+        <!-- https://mvnrepository.com/artifact/com.aparapi/aparapi -->
+        <dependency>
+            <groupId>com.aparapi</groupId>
+            <artifactId>aparapi</artifactId>
+            <version>3.0.0</version>
+        </dependency>
+
+        <dependency>
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+            <version>4.11</version>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
+
+    <build>
+        <pluginManagement><!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
+            <plugins>
+                <!-- clean lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#clean_Lifecycle -->
+                <plugin>
+                    <artifactId>maven-clean-plugin</artifactId>
+                    <version>3.1.0</version>
+                </plugin>
+                <!-- default lifecycle, jar packaging: see https://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
+                <plugin>
+                    <artifactId>maven-resources-plugin</artifactId>
+                    <version>3.0.2</version>
+                </plugin>
+                <plugin>
+                    <artifactId>maven-compiler-plugin</artifactId>
+                    <version>3.8.0</version>
+                </plugin>
+                <plugin>
+                    <artifactId>maven-surefire-plugin</artifactId>
+                    <version>2.22.1</version>
+                </plugin>
+                <plugin>
+                    <artifactId>maven-jar-plugin</artifactId>
+                    <configuration>
+                        <archive>
+                            <manifest>
+                                <mainClass>edu.thi.phga.aparapi_test.App</mainClass>
+                            </manifest>
+                        </archive>
+                    </configuration>
+
+                    <version>3.0.2</version>
+                </plugin>
+                <plugin>
+                    <artifactId>maven-assembly-plugin</artifactId>
+                    <configuration>
+                        <archive>
+                            <manifest>
+                                <mainClass>edu.thi.phga.aparapi_test.App</mainClass>
+                            </manifest>
+                        </archive>
+                        <descriptorRefs>
+                            <descriptorRef>jar-with-dependencies</descriptorRef>
+                        </descriptorRefs>
+                    </configuration>
+                </plugin>
+                <plugin>
+                    <artifactId>maven-install-plugin</artifactId>
+                    <version>2.5.2</version>
+                </plugin>
+                <plugin>
+                    <artifactId>maven-deploy-plugin</artifactId>
+                    <version>2.8.2</version>
+                </plugin>
+                <!-- site lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#site_Lifecycle -->
+                <plugin>
+                    <artifactId>maven-site-plugin</artifactId>
+                    <version>3.7.1</version>
+                </plugin>
+                <plugin>
+                    <artifactId>maven-project-info-reports-plugin</artifactId>
+                    <version>3.0.0</version>
+                </plugin>
+            </plugins>
+        </pluginManagement>
+    </build>
+</project>
--- a/src/main/java/edu/thi/phga/aparapi_test/App.java
+++ b/src/main/java/edu/thi/phga/aparapi_test/App.java
@ -0,0 +1,100 @@
+package edu.thi.phga.aparapi_test;
+
+import com.aparapi.device.OpenCLDevice;
+
+/**
+ * Command-line entry point: selects an OpenCL device and runs one demo target.
+ *
+ * <p>Positional arguments, all optional: {@code DEVICE TARGET N_LSHIFTS}.
+ * {@code DEVICE} is the 1-based number printed by {@link Devices#selectDevice()} (default 1),
+ * {@code TARGET} picks the demo (default 0) and {@code N_LSHIFTS} sets the input length of the
+ * reduce/prefix targets to {@code 1 << N_LSHIFTS} (default 25).
+ */
+public class App {
+  /** Zero-based index of the device to use; read by {@link Devices#selectDevice()}. */
+  public static int choice;
+  /** Device returned by {@link Devices#selectDevice()} during start-up. */
+  public static OpenCLDevice device;
+
+  /** Largest shift for which {@code 1 << shift} is still a positive {@code int}. */
+  private static final int MAX_SHIFT = 30;
+
+  /** Prints a separator line followed by {@code txt}. */
+  private static void printHeader(final String txt) {
+    final String spacer =
+      "######################################################################";
+    System.out.printf("%s\n%s\n", spacer, txt);
+  }
+
+  /**
+   * Allocates the input array for the reduce and prefix targets.
+   *
+   * @param shift number of left shifts; the array length is {@code 1 << shift}
+   * @return a new array of that length with every element set to 1
+   * @throws IllegalArgumentException if {@code shift} is negative or larger than 30
+   */
+  private static int[] ones(final int shift) {
+    if (shift < 0 || shift > MAX_SHIFT) {
+      throw new IllegalArgumentException(
+        "N_LSHIFTS must be between 0 and " + MAX_SHIFT + " but was " + shift);
+    }
+    final int[] b = new int[1 << shift];
+    java.util.Arrays.fill(b, 1);
+    return b;
+  }
+
+  /**
+   * Parses the arguments, selects the device and dispatches to the requested target.
+   * Unknown targets do nothing.
+   *
+   * @param args {@code [DEVICE [TARGET [N_LSHIFTS]]]}
+   * @throws NumberFormatException if an argument is not an integer
+   * @throws IllegalArgumentException if a reduce/prefix target is run with an invalid shift
+   */
+  public static void main(String[] args) {
+    // The command line is 1-based, the device list index is 0-based.
+    App.choice = args.length > 0 ? Integer.parseInt(args[0]) - 1 : 0;
+    App.device = Devices.selectDevice();
+
+    final int target = args.length > 1 ? Integer.parseInt(args[1]) : 0;
+    final int shift = args.length > 2 ? Integer.parseInt(args[2]) : 25;
+
+    // The input array is only allocated by the targets that need it, so that the
+    // informational and prime targets do not reserve 1 << shift ints for nothing.
+    switch (target) {
+      case 0:
+        OpenCLGetMemoryInfo.getInfo();
+        OpenCLSizeTest.start(8);
+        break;
+      // PRIMES
+      case 10:
+        printHeader("FindPrimes (Seriell, Parallel, OpenCL)");
+        // Original notes: "no longer correct from 1 << 14 upwards" (context unclear)
+        // and "CPU is faster than GPU...".
+        FindPrimes.start();
+        FindPrimesThreads.start();
+        FindPrimesOpenCL.start();
+        break;
+      // REDUCE
+      case 20: {
+        // One array is reused; it is refilled because each reduction modifies it in place.
+        final int[] b = ones(shift);
+        OpenCLReduce1.start(b);
+        java.util.Arrays.fill(b, 1);
+        OpenCLReduce2.start(b);
+        java.util.Arrays.fill(b, 1);
+        OpenCLReduce3.start(b);
+        java.util.Arrays.fill(b, 1);
+        OpenCLReduce4.start(b);
+        java.util.Arrays.fill(b, 1);
+        OpenCLReduce5.start(b);
+        break;
+      }
+      case 21:
+        OpenCLReduce1.start(ones(shift));
+        break;
+      case 22:
+        OpenCLReduce2.start(ones(shift));
+        break;
+      case 23:
+        OpenCLReduce3.start(ones(shift));
+        break;
+      case 24:
+        OpenCLReduce4.start(ones(shift));
+        break;
+      case 25:
+        OpenCLReduce5.start(ones(shift));
+        break;
+      // PREFIX
+      case 30: {
+        final int[] b = ones(shift);
+        // Up to 1 << 27
+        OpenCLPrefix1.start(b);
+        // Up to 1 << 28
+        java.util.Arrays.fill(b, 1);
+        OpenCLPrefix2.start(b);
+        break;
+      }
+      case 31:
+        // Up to 1 << 27
+        OpenCLPrefix1.start(ones(shift));
+        break;
+      case 32:
+        // Up to 1 << 28
+        OpenCLPrefix2.start(ones(shift));
+        break;
+      default:
+        break;
+    }
+  }
+}
--- a/src/main/java/edu/thi/phga/aparapi_test/Devices.java
+++ b/src/main/java/edu/thi/phga/aparapi_test/Devices.java
@ -0,0 +1,49 @@
+package edu.thi.phga.aparapi_test;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Scanner;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import com.aparapi.device.OpenCLDevice;
+import com.aparapi.internal.opencl.OpenCLPlatform;
+
+/** Enumerates the available OpenCL platform/device combinations and picks one of them. */
+public class Devices {
+  /** Matches the "major.minor" part of a platform version string (literal dot). */
+  private static final Pattern VERSION_PATTERN = Pattern.compile("\\d\\.\\d");
+
+  /**
+   * Prints a numbered list of all platform/device combinations and returns the device at the
+   * zero-based position {@link App#choice}.
+   *
+   * @return the selected device
+   * @throws IndexOutOfBoundsException if no device is available or {@link App#choice} does not
+   *     refer to an entry of the printed list
+   */
+  public static OpenCLDevice selectDevice() {
+    // Helper class pairing a device with the display data of its platform.
+    class Processor {
+      final String name;
+      final String version;
+      final String typ;
+      final OpenCLDevice device;
+
+      Processor(OpenCLPlatform platform, OpenCLDevice device) {
+        this.device = device;
+        name = platform.getName();
+        Matcher m = VERSION_PATTERN.matcher(platform.getVersion());
+        version = m.find() ? m.group() : "";
+        typ = device.getType().name();
+      }
+
+      @Override public String toString() {
+        return typ + " - OpenCL " + version + " - " + name;
+      }
+    }
+
+    // Create all platform/device combinations.
+    List<Processor> processors = new ArrayList<>();
+
+    for (var platform : OpenCLPlatform.getUncachedOpenCLPlatforms()) {
+      for (var device : platform.getOpenCLDevices()) {
+        processors.add(new Processor(platform, device));
+      }
+    }
+
+    for (int i = 0; i < processors.size(); i++) {
+      System.out.println(i + 1 + ") " + processors.get(i));
+    }
+
+    // Fail with an actionable message instead of the bare index error of List.get().
+    if (processors.isEmpty()) {
+      throw new IndexOutOfBoundsException("No OpenCL device found on this machine");
+    }
+    if (App.choice < 0 || App.choice >= processors.size()) {
+      throw new IndexOutOfBoundsException(
+        "Device " + (App.choice + 1) + " does not exist; choose a number from 1 to "
+          + processors.size());
+    }
+
+    var processor = processors.get(App.choice);
+    System.out.println("\n" + processor + "\n");
+
+    return processor.device;
+  }
+}
--- a/src/main/java/edu/thi/phga/aparapi_test/FindPrimes.java
+++ b/src/main/java/edu/thi/phga/aparapi_test/FindPrimes.java
@ -0,0 +1,35 @@
+package edu.thi.phga.aparapi_test;
+
+/**
+ * Sequential baseline: tests {@code N} odd numbers, starting at {@code START} and stepping
+ * by 2, for primality by trial division and prints the elapsed time and the prime count.
+ */
+public class FindPrimes {
+  private static final int N = 10_000_000;
+  private static int START = 123_456_789;
+  private static boolean[] istPrime = new boolean[N];
+
+  /** Runs the test once, then prints the duration in milliseconds and the number of primes. */
+  public static void start() {
+    final long begin = System.nanoTime();
+    primeTest();
+    final long end = System.nanoTime();
+
+    int primes = 0;
+    for (int slot = 0; slot < istPrime.length; slot++) {
+      if (istPrime[slot]) {
+        primes++;
+      }
+    }
+
+    System.out.println("Single: " + (end - begin) / 1000000 + " ms: " + primes);
+  }
+
+  /** Fills {@code istPrime[i]} with the result for the candidate {@code START + 2 * i}. */
+  private static void primeTest() {
+    int candidate = START;
+    for (int slot = 0; slot < N; slot++) {
+      // Try odd divisors until one divides the candidate or its square reaches the candidate.
+      int divisor = 3;
+      while (candidate > divisor * divisor && candidate % divisor != 0) {
+        divisor += 2;
+      }
+
+      istPrime[slot] = candidate % divisor != 0;
+      candidate += 2;
+    }
+  }
+}
--- a/src/main/java/edu/thi/phga/aparapi_test/FindPrimesKernel.java
+++ b/src/main/java/edu/thi/phga/aparapi_test/FindPrimesKernel.java
@ -0,0 +1,25 @@
+package edu.thi.phga.aparapi_test;
+import com.aparapi.Kernel;
+
+/**
+ * Aparapi kernel in which each work item tests one number for primality by trial division
+ * with odd divisors.
+ */
+public class FindPrimesKernel extends Kernel {
+  // First number to test; work item i tests start + 2 * i.
+  private int start;
+  // Output: istPrime[i] receives the result for work item i.
+  private boolean[] istPrime;
+
+  /**
+   * @param start first number to test
+   * @param istPrime result array, written at the global id of each work item
+   */
+  public FindPrimesKernel(int start, boolean[] istPrime) {
+    this.start = start;
+    this.istPrime = istPrime;
+  }
+
+  /** Tests the number belonging to this work item and stores the result. */
+  @Override public void run() {
+    int index = getGlobalId(0);
+    int zahl = start + index * 2;
+    int teiler = 3;
+
+    // Advance through odd divisors until one divides zahl or its square reaches zahl.
+    while (teiler * teiler < zahl && zahl % teiler != 0) {
+      teiler += 2;
+    }
+
+    // zahl counts as prime when the divisor the loop stopped at does not divide it.
+    istPrime[index] = zahl % teiler != 0;
+
+  }
+}
--- a/src/main/java/edu/thi/phga/aparapi_test/FindPrimesOpenCL.java
+++ b/src/main/java/edu/thi/phga/aparapi_test/FindPrimesOpenCL.java
@ -0,0 +1,28 @@
+package edu.thi.phga.aparapi_test;
+
+import com.aparapi.Kernel;
+import com.aparapi.Range;
+
+/** Runs {@link FindPrimesKernel} over {@code N} candidates and prints time and prime count. */
+public class FindPrimesOpenCL {
+  private static final int N = 10_000_000;
+  private static final int START = 123_456_789;
+  private static boolean[] istPrime = new boolean[N];
+
+  /**
+   * Executes the kernel twice on the device returned by {@link Devices#selectDevice()} with a
+   * global size of {@code N} and a local size of 250, then prints the execution time reported
+   * by the kernel and the number of primes found.
+   */
+  public static void start() {
+    final Range range = Range.create(Devices.selectDevice(), N, 250);
+    final Kernel kernel = new FindPrimesKernel(START, istPrime);
+
+    // NOTE(review): the first run presumably serves as warm-up so that the reported time of
+    // the second run excludes the one-off conversion cost - confirm.
+    kernel.execute(range);
+    kernel.execute(range);
+
+    int primes = 0;
+    for (int slot = 0; slot < istPrime.length; slot++) {
+      if (istPrime[slot]) {
+        primes++;
+      }
+    }
+    System.out.println("OpenCL: " + kernel.getExecutionTime() + " ms: " + primes);
+  }
+
+}
--- a/src/main/java/edu/thi/phga/aparapi_test/FindPrimesThreads.java
+++ b/src/main/java/edu/thi/phga/aparapi_test/FindPrimesThreads.java
@ -0,0 +1,69 @@
+package edu.thi.phga.aparapi_test;
+
+import java.util.List;
+import java.util.concurrent.Callable;
+import java.util.concurrent.Executors;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+public class FindPrimesThreads {
+  private static final int N = 10_000_000;
+  private static final int P = Runtime.getRuntime().availableProcessors();
+  private static int START = 123_456_789;
+  private static boolean[] istPrime = new boolean[N];
+
+  public static void start() {
+    long t1 = System.nanoTime();
+    primeTest();
+    long t2 = System.nanoTime();
+
+    int a = 0;
+    for (var b : istPrime) {
+      if (b) {
+        a++;
+      }
+    }
+
+    System.out.println("Threads(" + P + "): " + (t2 - t1) / 1000000 + " ms: " + a);
+
+  }
+
+  private static void primeTest() {
+    // Taskliste anlegen
+    List<Callable<Object>> tasks = IntStream
+      .range(0, P)
+      .mapToObj(Task::new)
+      .map(Executors::callable)
+      .collect(Collectors.toList());
+
+    try {
+      Executors.newCachedThreadPool().invokeAll(tasks);
+    } catch (InterruptedException e) {
+
+    }
+
+  }
+
+  private static class Task implements Runnable {
+    private static final int SPANNE = N / P;
+    private int index;
+
+    Task(int index) {
+      this.index = index;
+    }
+
+    @Override
+    public void run() {
+      int start = START + index * 2 * SPANNE;
+      for (int z = start, i = 0; i < SPANNE; z += 2, i++) {
+        int teiler = 3;
+
+        while(z > teiler * teiler && z % teiler != 0){
+          teiler += 2;
+        }
+
+        istPrime[index * SPANNE + i] = z % teiler != 0;
+      }
+    }
+  }
+}
--- a/src/main/java/edu/thi/phga/aparapi_test/OpenCLGetMemoryInfo.java
+++ b/src/main/java/edu/thi/phga/aparapi_test/OpenCLGetMemoryInfo.java
@ -0,0 +1,15 @@
+package edu.thi.phga.aparapi_test;
+
+import com.aparapi.device.Device;
+
+/** Prints local-memory and work-group limits of {@link App#device}. */
+public class OpenCLGetMemoryInfo {
+  /**
+   * Prints the local memory size, the maximum work-group size and the smaller of that maximum
+   * and a quarter of the local memory size.
+   */
+  public static void getInfo() {
+    final int maxWGSize = App.device.getMaxWorkGroupSize();
+    final int localMem = (int) App.device.getLocalMemSize();
+
+    // NOTE(review): the division by 4 presumably converts bytes into int slots - confirm.
+    final int intSlots = localMem / 4;
+    final int localSize = (maxWGSize < intSlots) ? maxWGSize : intSlots;
+
+    System.out.printf("Memsize: %d, Max-WGSize: %d, LocalSize: %d\n",
+      localMem, maxWGSize, localSize);
+  }
+}
--- a/src/main/java/edu/thi/phga/aparapi_test/OpenCLPrefix1.java
+++ b/src/main/java/edu/thi/phga/aparapi_test/OpenCLPrefix1.java
@ -0,0 +1,77 @@
+package edu.thi.phga.aparapi_test;
+
+import com.aparapi.Kernel;
+import com.aparapi.Range;
+
+
+/**
+ * Parallel prefix sum that ping-pongs between the two halves of one buffer of twice the
+ * input length.
+ */
+public class OpenCLPrefix1 {
+
+  /**
+   * Computes prefix sums of {@code a} on {@link App#device} by running the kernel with step
+   * sizes 1, 2, 4, ... below the input length, then prints the last three results and the
+   * accumulated execution time minus the conversion time.
+   *
+   * @param a input values; the array itself is not modified
+   */
+  public static void start(final int[] a) {
+    int N = a.length;
+    // Double buffer: [0, N) and [N, 2N) alternate as source and destination.
+    int[] b = new int[N * 2];
+    // Seed the source half with the caller's data. The other half needs no initial value
+    // because the first pass writes every element of it.
+    System.arraycopy(a, 0, b, 0, N);
+    Range r = Range.create(App.device, N, 4);
+    OpenCLPrefix1Kernel k = new OpenCLPrefix1Kernel(b);
+    k.setExplicit(true);
+    k.setStepSize(1);
+    k.execute(r);
+    k.toggleIndex();
+    // Taken after the first run so that it can be subtracted from the accumulated time.
+    double ct = k.getConversionTime();
+    // For debugging, call k.get(b) after a pass and dump both halves of b.
+
+    for (int n = 2; n < N; n *= 2) {
+      k.setStepSize(n);
+      k.execute(r);
+      k.toggleIndex();
+    }
+
+    // Explicit mode: the buffer has to be fetched back from the device by hand.
+    k.get(b);
+    double et = k.getAccumulatedExecutionTime() - ct;
+
+    // After the final toggle the index is the offset of the half written last.
+    int n = N + k.getIndex();
+    System.out.printf("P1 - GOT: %d, %d, %d, TIME: %.2f ms\n",
+      b[n - 3], b[n - 2], b[n - 1], et);
+  }
+
+  /** One scan pass: reads the half starting at {@code index}, writes the other half. */
+  private static class OpenCLPrefix1Kernel extends Kernel {
+    private int stepSize;
+    // Offset of the half that is read by the next pass: either 0 or N.
+    private int index;
+    private int N;
+    private int[] a;
+
+    /** @param a buffer of twice the input length */
+    public OpenCLPrefix1Kernel(int[] a) {
+      this.a = a;
+      this.N = a.length / 2;
+    }
+
+    public void setStepSize(int s) {
+      this.stepSize = s;
+    }
+
+    /** Swaps the roles of the two halves. */
+    public void toggleIndex() {
+      index = N - index;
+    }
+
+    public int getIndex() {
+      return index;
+    }
+
+    @Override
+    public void run() {
+      int i = getGlobalId();
+      int out = N - index;
+      // Elements with fewer than stepSize predecessors are copied unchanged.
+      if (i < stepSize) {
+        a[out + i] = a[index + i];
+      } else {
+        a[out + i] = a[index + i] + a[index + i - stepSize];
+      }
+    }
+  }
+}
--- a/src/main/java/edu/thi/phga/aparapi_test/OpenCLPrefix2.java
+++ b/src/main/java/edu/thi/phga/aparapi_test/OpenCLPrefix2.java
@ -0,0 +1,112 @@
+package edu.thi.phga.aparapi_test;
+
+import com.aparapi.Kernel;
+import com.aparapi.Range;
+
+public class OpenCLPrefix2 {
+
+  /**
+   * Parallel prefix implementation with two arrays instead of one array twice the size of
+   * the requested input.
+   *
+   * <p>Runs the kernel with step sizes 1, 2, 4, ... below the input length, alternating the
+   * direction (A to B, B to A) after every pass, then prints the last three results and the
+   * accumulated execution time minus the conversion time.
+   *
+   * <p>Original author's notes: accumulated execution time was equally good; works up to
+   * 1 << 28 before OutOfMemoryError.
+   *
+   * <p>Rewrite of the original idea by: Prof. Dr. Schmidt <Ulrich.Schmidt@thi.de>
+   *
+   * @param a input values; may be overwritten with intermediate or final results
+   */
+  public static void start(int[] a) {
+    int N = a.length;
+    int b[] = new int[N];
+    // The first pass (A -> B) writes every element of b, so this initial value is not read.
+    java.util.Arrays.fill(b, 1);
+    Range r = Range.create(App.device, N, 4);
+    OpenCLPrefix2Kernel k = new OpenCLPrefix2Kernel(a, b);
+    k.setExplicit(true);
+    k.setStepSize(1);
+    k.execute(r);
+    k.toggleIndex();
+    // Taken after the first run so that it can be subtracted from the accumulated time.
+    double ct = k.getConversionTime();
+    // Uncomment to get Debug output (Try with 1 << 3)
+    // k.get(a);
+    // k.get(b);
+    // System.out.println(k.getIndex());
+    // for (int i = 0; i < b.length; i++) {
+    //   System.out.printf("a[%d] = %d\n", i, a[i]);
+    // }
+    // for (int i = 0; i < b.length; i++) {
+    //   System.out.printf("b[%d] = %d\n", i, b[i]);
+    // }
+
+    for (int n = 2; n < N; n *= 2) {
+      k.setStepSize(n);
+      k.execute(r);
+      k.toggleIndex();
+      // Uncomment to get Debug output (Try with 1 << 3)
+      // k.get(a);
+      // k.get(b);
+      // System.out.println(k.getIndex());
+      // for (int i = 0; i < b.length; i++) {
+      //   System.out.printf("a[%d] = %d\n", i, a[i]);
+      // }
+      // for (int i = 0; i < b.length; i++) {
+      //   System.out.printf("b[%d] = %d\n", i, b[i]);
+      // }
+    }
+
+    double et = k.getAccumulatedExecutionTime() - ct;
+
+    // The index was toggled after the last pass: a positive index means that pass ran with a
+    // negative index (B -> A), so the last write was to A.
+    if (k.getIndex() > 0) {
+      k.get(a);
+      // Otherwise the last write was to B.
+    } else {
+      k.get(b);
+      a = b;
+    }
+    System.out.printf("P2 - GOT: %d, %d, %d, TIME: %.2f ms\n",
+      a[N - 3], a[N - 2], a[N - 1], et);
+
+  }
+
+  /** One scan pass between the two arrays; the sign of {@code index} selects the direction. */
+  private static class OpenCLPrefix2Kernel extends Kernel {
+    private int stepSize;
+    // Positive: read a, write b. Negative: read b, write a.
+    private int index = 1;
+    private int[] a, b;
+
+    public OpenCLPrefix2Kernel(int[] a, int[] b) {
+      this.a = a;
+      this.b = b;
+    }
+
+    public void setStepSize(int s) {
+      this.stepSize = s;
+    }
+
+    /** Reverses the direction of the next pass. */
+    public void toggleIndex() {
+      index = -index;
+    }
+
+    public int getIndex() {
+      return index;
+    }
+
+    @Override
+    public void run() {
+      int i = getGlobalId();
+      // A -> B
+      if (index > 0) {
+        // Elements with fewer than stepSize predecessors are copied unchanged.
+        if (i < stepSize) {
+          b[i] = a[i];
+        } else {
+          b[i] = a[i] + a[i - stepSize];
+        }
+        // B -> A
+      } else {
+        if (i < stepSize) {
+          a[i] = b[i];
+        } else {
+          a[i] = b[i] + b[i - stepSize];
+        }
+      }
+    }
+  }
+}
--- a/src/main/java/edu/thi/phga/aparapi_test/OpenCLReduce1.java
+++ b/src/main/java/edu/thi/phga/aparapi_test/OpenCLReduce1.java
@ -0,0 +1,40 @@
+package edu.thi.phga.aparapi_test;
+
+import com.aparapi.Kernel;
+import com.aparapi.Range;
+
+/** Sum reduction in a single kernel launch; every work item loops over all strides itself. */
+public class OpenCLReduce1 {
+  /**
+   * Reduces {@code a} in place with one launch of {@code a.length / 2} work items and prints
+   * the expected value ({@code a.length}), {@code a[0]} and the execution time minus the
+   * conversion time.
+   *
+   * @param a input values; modified in place, the result is read from {@code a[0]}
+   */
+  public static void start(final int[] a) {
+    int optimalSize = a.length / 2;
+    // Debug output: each work item records the global size it observed.
+    int[] gs = new int[optimalSize];
+
+    Kernel k = new Kernel() {
+        @Override
+        public void run() {
+          int i = getGlobalId();
+          int size = getGlobalSize();
+
+          gs[i] = size;
+
+          // Stride s doubles each round; only work items whose id is a multiple of s add.
+          // NOTE(review): there is no barrier between rounds, so a work item may read a
+          // partial sum another work item has not written yet - confirm whether this
+          // variant is meant to demonstrate that.
+          for (int s = 1; s <= size; s *= 2) {
+            if (i % s == 0) {
+              a[2 * i] += a[2 * i + s];
+            }
+          }
+        }
+      };
+    // Local size is capped at 256.
+    k.execute(Range.create(optimalSize, optimalSize <= 256 ? optimalSize : 256));
+    double ct = k.getConversionTime();
+    double et = k.getExecutionTime() - ct;
+
+      // if (a.length <= 64) {
+      //   for (int x = 0; x < optimalSize; x++) {
+      //     System.out.printf("GI: %d, GSize: %d\n", x, gs[x]);
+      //   }
+      // } else {
+      //     System.out.printf("GI: %d, GSize: %d\n", 0, gs[0]);
+      //     System.out.printf("GI: %d, GSize: %d\n", optimalSize - 1, gs[optimalSize - 1]);
+      // }
+      System.out.printf("R1 - WANT: %d, GOT: %d, TIME: %.2f ms\n", a.length, a[0], et);
+  }
+}
--- a/src/main/java/edu/thi/phga/aparapi_test/OpenCLReduce2.java
+++ b/src/main/java/edu/thi/phga/aparapi_test/OpenCLReduce2.java
@ -0,0 +1,35 @@
+package edu.thi.phga.aparapi_test;
+
+import com.aparapi.Kernel;
+import com.aparapi.Range;
+
+/** Sum reduction with one kernel launch per level and a halved global size on each level. */
+public class OpenCLReduce2 {
+  /**
+   * Reduces {@code a} in place by launching the kernel with global sizes N/2, N/4, ..., 1 and
+   * prints the expected value ({@code a.length}), {@code a[0]} and the accumulated execution
+   * time minus the conversion time. Explicit buffer management is not enabled here.
+   *
+   * @param a input values; modified in place, the result is read from {@code a[0]}
+   */
+  public static void start(int[] a) {
+    int optimalSize = a.length / 2;
+    int N = a.length;
+    // Debug output: each work item records the segment size it used.
+    int[] gs = new int[optimalSize];
+
+    Kernel k = new Kernel() {
+        @Override
+        public void run() {
+          // Segment length handled by one work item on this level.
+          int size = N / getGlobalSize(); // 2, 4, 8, ...
+          // First element of this work item's segment.
+          int i = getGlobalId() * size; // 2: 0 -> 0, 1 -> 2, 2 -> 4, 3 -> 6, ...
+
+          gs[getGlobalId()] = size;
+
+          // Fold the partial sum at the middle of the segment into its first element.
+          a[i] += a[i + size / 2]; // 1, 2, 4, ..
+        }
+      };
+
+    // First level; the local size is capped at 256.
+    k.execute(Range.create(N / 2, optimalSize < 256 ? optimalSize : 256));
+    // Taken after the first run so that it can be subtracted from the accumulated time.
+    double ct = k.getConversionTime();
+
+    // Remaining levels, down to a single work item.
+    for (int n = N / 4; n >= 1; n /= 2) {
+      k.execute(Range.create(n, n < 256 ? n : 256));
+    }
+
+    double et = k.getAccumulatedExecutionTime() - ct;
+    System.out.printf("R2 - WANT: %d, GOT: %d, TIME: %.2f ms\n", a.length, a[0], et);
+
+  }
+}
--- a/src/main/java/edu/thi/phga/aparapi_test/OpenCLReduce3.java
+++ b/src/main/java/edu/thi/phga/aparapi_test/OpenCLReduce3.java
@ -0,0 +1,51 @@
+package edu.thi.phga.aparapi_test;
+
+import com.aparapi.Kernel;
+import com.aparapi.Range;
+
+/**
+ * Sum reduction with one kernel launch per level, using explicit buffer management so the
+ * array is only fetched back once at the end.
+ */
+public class OpenCLReduce3 {
+  /**
+   * Reduces {@code a} in place by launching the kernel with global sizes N/2, N/4, ..., 1 and
+   * prints the expected value ({@code a.length}), {@code a[0]} and the accumulated execution
+   * time minus the conversion time.
+   *
+   * @param a input values; modified in place, the result is read from {@code a[0]}
+   */
+  public static void start(int[] a) {
+    int optimalSize = a.length / 2;
+    int N = a.length;
+    // Debug output: each work item records the segment size it used.
+    int[] gs = new int[optimalSize];
+
+    Kernel k = new Kernel() {
+        @Override
+        public void run() {
+          // Segment length handled by one work item on this level.
+          int size = N / getGlobalSize(); // 2, 4, 8, ...
+          // First element of this work item's segment.
+          int i = getGlobalId() * size; // 2: 0 -> 0, 1 -> 2, 2 -> 4, 3 -> 6, ...
+
+          gs[getGlobalId()] = size;
+
+          // Fold the partial sum at the middle of the segment into its first element.
+          a[i] += a[i + size / 2]; // 1, 2, 4, ..
+        }
+      };
+
+    // Manually manage variables that are accessible and returned by kernel
+    // Anything that is passed to the constructor is still automatically pushed once
+    // to the Memory on the gpu. This is done to reduce the amount of times
+    // the data has to be transferred to the gpu memory via slow pci bus
+    k.setExplicit(true);
+
+    try {
+      System.out.printf("MAX WG SIZE: %d\n", k.getKernelMaxWorkGroupSize(App.device));
+    } catch (Exception e) {
+      // The value is informational only: report the failure and carry on with the reduction.
+      System.err.println("MAX WG SIZE: unavailable (" + e + ")");
+    }
+    // Not necessary (So maybe anything used by the kernel is put once?)
+    // k.put(a);
+
+    // First level; the local size is capped at 256.
+    k.execute(Range.create(N / 2, Math.min(optimalSize, 256)));
+    // Taken after the first run so that it can be subtracted from the accumulated time.
+    double ct = k.getConversionTime();
+
+    // Remaining levels, down to a single work item.
+    for (int n = N / 4; n >= 1; n /= 2) {
+      k.execute(Range.create(n, Math.min(n, 256)));
+    }
+
+    // Explicit mode: the result has to be fetched back from the device by hand.
+    k.get(a);
+
+    double et = k.getAccumulatedExecutionTime() - ct;
+    System.out.printf("R3 - WANT: %d, GOT: %d, TIME: %.2f ms\n", a.length, a[0], et);
+
+  }
+}
--- a/src/main/java/edu/thi/phga/aparapi_test/OpenCLReduce4.java
+++ b/src/main/java/edu/thi/phga/aparapi_test/OpenCLReduce4.java
@ -0,0 +1,51 @@
+package edu.thi.phga.aparapi_test;
+
+import com.aparapi.Kernel;
+import com.aparapi.Range;
+
+/**
+ * Sum reduction with one kernel launch per level in which work item i adds the element one
+ * global size further up, so that neighbouring work items touch neighbouring elements.
+ */
+public class OpenCLReduce4 {
+  /**
+   * Reduces {@code a} in place by launching the kernel with global sizes N/2, N/4, ..., 1 and
+   * prints the expected value ({@code a.length}), {@code a[0]} and the accumulated execution
+   * time minus the conversion time.
+   *
+   * @param a input values; modified in place, the result is read from {@code a[0]}
+   */
+  public static void start(int[] a) {
+    int optimalSize = a.length / 2;
+    int N = a.length;
+    // Debug output: each work item records the global size it observed.
+    int[] gs = new int[optimalSize];
+
+    Kernel k = new Kernel() {
+        @Override
+        public void run() {
+          int size = getGlobalSize(); // N/2, N/4, ..., 1
+          int i = getGlobalId(); // work item i owns element i
+
+          gs[getGlobalId()] = size;
+
+          // Fold the upper half of the still active range onto the lower half.
+          a[i] += a[i + size];
+        }
+      };
+
+    // Manually manage variables that are accessible and returned by kernel
+    // Anything that is passed to the constructor is still automatically pushed once
+    // to the Memory on the gpu. This is done to reduce the amount of times
+    // the data has to be transferred to the gpu memory via slow pci bus
+    k.setExplicit(true);
+
+    try {
+      System.out.printf("MAX WG SIZE: %d\n", k.getKernelMaxWorkGroupSize(App.device));
+    } catch (Exception e) {
+      // The value is informational only: report the failure and carry on with the reduction.
+      System.err.println("MAX WG SIZE: unavailable (" + e + ")");
+    }
+    // Not necessary (So maybe anything used by the kernel is put once?)
+    // k.put(a);
+
+    // First level; the local size is capped at 256.
+    k.execute(Range.create(N / 2, Math.min(optimalSize, 256)));
+    // Taken after the first run so that it can be subtracted from the accumulated time.
+    double ct = k.getConversionTime();
+
+    // Remaining levels, down to a single work item.
+    for (int n = N / 4; n >= 1; n /= 2) {
+      k.execute(Range.create(n, Math.min(n, 256)));
+    }
+
+    // Explicit mode: the result has to be fetched back from the device by hand.
+    k.get(a);
+
+    double et = k.getAccumulatedExecutionTime() - ct;
+    System.out.printf("R4 - WANT: %d, GOT: %d, TIME: %.2f ms\n", a.length, a[0], et);
+
+  }
+}
--- a/src/main/java/edu/thi/phga/aparapi_test/OpenCLReduce5.java
+++ b/src/main/java/edu/thi/phga/aparapi_test/OpenCLReduce5.java
@ -0,0 +1,52 @@
+package edu.thi.phga.aparapi_test;
+
+import com.aparapi.Range;
+
+/** Two-step sum reduction that uses {@link OpenCLReduce5Kernel}. */
+public class OpenCLReduce5 {
+  /**
+   * Sums {@code a} on {@link App#device}: the first launch produces one partial sum per work
+   * group, the second launch (a single work group) combines the partial sums. Prints the
+   * expected value ({@code a.length}), the computed sum and the accumulated execution time
+   * minus the conversion time.
+   *
+   * @param a input values
+   */
+  public static void start(int[] a) {
+    int N = a.length;
+
+    int localMem  = (int) App.device.getLocalMemSize();
+    int maxWGSize = Math.min(App.device.getMaxWorkGroupSize(), 256);
+
+    // NOTE(review): the division by 4 presumably converts bytes into int slots - confirm.
+    int localSize  = Math.min(maxWGSize, localMem / 4);
+    // At most 2 * localSize work groups, which is the capacity of result[] below.
+    int globalSize = Math.min(N, 2 * localSize * localSize);
+    localSize      = Math.min(localSize, globalSize);
+
+    // These have to be created on the host side for the kernel to access them later.
+    // The original notes refer to "slide 180" for the input array a, result and summe.
+    int[] result = new int[2 * localSize];
+    int[] summe = { 0 };
+
+    OpenCLReduce5Kernel k = new OpenCLReduce5Kernel(a, localSize, result, summe);
+
+    try {
+      System.out.printf("KERNEL MAX WG SIZE: %d\n", k.getKernelMaxWorkGroupSize(App.device));
+    } catch (Exception e) {
+      // The value is informational only: report the failure and carry on with the reduction.
+      System.err.println("KERNEL MAX WG SIZE: unavailable (" + e + ")");
+    }
+
+    // Manually manage variables that are accessible and returned by kernel
+    // Anything that is passed to the constructor is still automatically pushed once
+    // to the Memory on the gpu. This is done to reduce the amount of times
+    // the data has to be transferred to the gpu memory via slow pci bus
+    k.setExplicit(true);
+
+    Range r1 = Range.create(App.device, globalSize, localSize);
+    Range r2 = Range.create(App.device, localSize, localSize);
+
+    // Not necessary (So maybe anything used by the kernel is put once?)
+    // k.put(a);
+
+    // Step 1: one partial sum per work group.
+    k.execute(r1);
+    // Taken after the first run so that it can be subtracted from the accumulated time.
+    double ct = k.getConversionTime();
+
+    // Step 2: combine the partial sums within a single work group.
+    k.setStep(2);
+    k.execute(r2);
+    // Explicit mode: the result has to be fetched back from the device by hand.
+    k.get(summe);
+
+    double et = k.getAccumulatedExecutionTime() - ct;
+    System.out.printf("R5 - WANT: %d, GOT: %d, TIME: %.2f ms\n", a.length, summe[0], et);
+
+  }
+}
--- a/src/main/java/edu/thi/phga/aparapi_test/OpenCLReduce5Kernel.java
+++ b/src/main/java/edu/thi/phga/aparapi_test/OpenCLReduce5Kernel.java
@ -0,0 +1,76 @@
+package edu.thi.phga.aparapi_test;
+
+import com.aparapi.Kernel;
+
+/**
+ * Kernel for a two-step sum reduction. Step 1 writes one partial sum per work group into
+ * {@code result}; step 2 combines the entries of {@code result} into {@code summe[0]}.
+ */
+public class OpenCLReduce5Kernel extends Kernel {
+  // a: input, result: per-group partial sums, summe: single-element output.
+  private int[] a, result, summe;
+  // Selects which step run() executes; 1 runs step1(), any other value runs step2().
+  private int step = 1;
+  // Per-work-group scratch buffer (marked @Local), one slot per work item.
+  @Local
+  private int[] scratch;
+
+  /**
+   * @param a input values
+   * @param localSize length of the scratch buffer
+   * @param result receives one partial sum per work group in step 1
+   * @param summe receives the total in element 0 in step 2
+   */
+  public OpenCLReduce5Kernel(int[] a, int localSize, int[] result, int[] summe) {
+    this.a = a;
+    this.result = result;
+    this.summe = summe;
+    scratch = new int[localSize];
+  }
+
+  /** Dispatches to the step selected with {@link #setStep(int)}. */
+  @Override
+  public void run() {
+    if (this.step == 1) {
+      step1();
+    } else {
+      step2();
+    }
+  }
+
+  public void setStep(int step) {
+    this.step = step;
+  }
+
+  // Step 1: each work item sums a strided slice of a, then the work group reduces those
+  // sums in scratch and work item 0 stores the group total in result[groupId].
+  private void step1() {
+    int globalId = getGlobalId();
+    int localId = getLocalId();
+    int globalSize = getGlobalSize();
+
+    int sum = 0;
+
+    // Elements globalId, globalId + globalSize, globalId + 2 * globalSize, ...
+    for (int i = globalId; i < a.length; i += globalSize) {
+      sum += a[i];
+    }
+
+    scratch[localId] = sum;
+    localBarrier();
+
+    // Halve the active range each round, folding the upper half onto the lower half.
+    // NOTE(review): this only covers every slot when the local size is a power of two -
+    // confirm.
+    for (int s = getLocalSize() / 2; s > 0; s /= 2) {
+      if (localId < s) {
+        scratch[localId] += scratch[localId + s];
+      }
+      localBarrier();
+    }
+
+    if (localId == 0) {
+      result[getGroupId()] = scratch[0];
+    }
+  }
+
+  // Processes the individual elements of result[]:
+  // first sequentially once more, then in parallel.
+  private void step2() {
+    int localId = getLocalId();
+    int localSize = getLocalSize();
+    // Each work item first adds two entries of result that lie localSize apart.
+    scratch[localId] = result[localId] + result[localId + localSize];
+    localBarrier(); // Wait for all to finish (locally)
+
+    for (int s = localSize / 2; s > 0; s /= 2) {
+      if (localId < s) {
+        scratch[localId] += scratch[localId + s];
+      }
+      localBarrier();
+    }
+    if (localId == 0) {
+      summe[0] = scratch[0];
+    }
+  }
+
+}
--- a/src/main/java/edu/thi/phga/aparapi_test/OpenCLSizeTest.java
+++ b/src/main/java/edu/thi/phga/aparapi_test/OpenCLSizeTest.java
@ -0,0 +1,47 @@
+package edu.thi.phga.aparapi_test;
+
+import com.aparapi.Kernel;
+import com.aparapi.Range;
+
+/** Prints the ids and sizes that each work item of a small test kernel observes. */
+public class OpenCLSizeTest {
+  /**
+   * Launches {@code n} work items with a local size of 4 and prints global id, local id,
+   * global size, local size and a computed value per work item. From 3000 work items upwards
+   * only the first and the last row are printed.
+   *
+   * @param n number of work items
+   */
+  public static void start(int n) {
+    final int N = n;
+    int[] res = new int[N];
+    int[] gids = new int[N];
+    int[] lids = new int[N];
+    int[] gsizes = new int[N];
+    int[] lsizes = new int[N];
+
+    new Kernel() {
+      @Override
+      public void run() {
+        int i = getGlobalId();
+
+        int gi = getGlobalId();
+        int li = getLocalId();
+        int gs = getGlobalSize();
+        int ls = getLocalSize();
+
+        gids[i] = gi;
+        lids[i] = li;
+        gsizes[i] = gs;
+        lsizes[i] = ls;
+
+        res[i] = 10 + i;
+      }
+    }.execute(Range.create(N, 4));
+
+    final String rowFormat = "GI: %d, LI: %d, GSize: %d, LSize: %d, RES: %d\n";
+
+    if (N >= 3000) {
+      // Too many rows to list: show only both ends.
+      final int last = N - 1;
+      System.out.printf(rowFormat, gids[0], lids[0], gsizes[0], lsizes[0], res[0]);
+      System.out.printf(rowFormat,
+        gids[last], lids[last], gsizes[last], lsizes[last], res[last]);
+      return;
+    }
+
+    for (int row = 0; row < N; row++) {
+      System.out.printf(rowFormat,
+        gids[row], lids[row], gsizes[row], lsizes[row], res[row]);
+    }
+  }
+}
--- a/src/main/java/edu/thi/phga/aparapi_test/VAdd.java
+++ b/src/main/java/edu/thi/phga/aparapi_test/VAdd.java
@ -0,0 +1,46 @@
+package edu.thi.phga.aparapi_test;
+
+import com.aparapi.Kernel;
+import com.aparapi.Range;
+import com.aparapi.device.Device;
+
+/** Minimal vector-addition demo that also reports conversion and execution times. */
+public class VAdd {
+  /**
+   * Adds two four-element vectors with {@link VAddKernel} on the device returned by
+   * {@link Devices#selectDevice()}, prints the timings of two runs and then the sums.
+   */
+  public VAdd() {
+    // System.setProperty("com.aparapi.enableShowGeneratedOpenCL", "true");
+    final int[] a = {1, 2, 3, 4};
+    final int[] b = {5, 6, 7, 8};
+    final int[] c = new int[a.length];
+
+    final Device device = Devices.selectDevice();
+    final Range range = Range.create(device, a.length);
+    final Kernel kernel = new VAddKernel(a, b, c);
+
+    // Force an execution so that the conversion time becomes available.
+    kernel.execute(range);
+    final double conversionMs = kernel.getConversionTime();
+    final double firstRunMs = kernel.getExecutionTime();
+
+    System.out.println("Conversion Time: " + conversionMs + " ms");
+    System.out.println("Execution Time: " + firstRunMs + " ms");
+    System.out.println("Execution w/o Conversion Time: " + (firstRunMs - conversionMs) + " ms");
+
+    kernel.execute(range);
+    final double secondRunMs = kernel.getExecutionTime();
+    System.out.println("Execution Time(2): " + secondRunMs + " ms");
+
+    int pos = 0;
+    while (pos < a.length) {
+      System.out.printf("%d + %d = %2d\n", a[pos], b[pos], c[pos]);
+      pos++;
+    }
+  }
+
+
+  // Measurements recorded by the original author.
+  //
+  // CPU
+  // Conversion Time: 309.714392 ms
+  // Execution Time: 310.215247 ms
+  // Execution w/o Conversion Time: 0.5008550000000014 ms
+  // Execution Time(2): 0.103835 ms
+
+
+  // GPU
+  // Conversion Time: 249.313076 ms
+  // Execution Time: 250.076624 ms
+  // Execution w/o Conversion Time: 0.7635480000000143 ms
+  // Execution Time(2): 0.075589 ms
+}
--- a/src/main/java/edu/thi/phga/aparapi_test/VAddKernel.java
+++ b/src/main/java/edu/thi/phga/aparapi_test/VAddKernel.java
@ -0,0 +1,19 @@
+package edu.thi.phga.aparapi_test;
+
+import com.aparapi.Kernel;
+
+/** Aparapi kernel that adds two int vectors element by element. */
+public class VAddKernel extends Kernel {
+  // a and b are the inputs, c receives the sums.
+  private int[] a, b, c;
+
+  /**
+   * @param a first input vector
+   * @param b second input vector
+   * @param c output vector, written at the global id of each work item
+   */
+  public VAddKernel(int[] a, int[] b, int[] c) {
+    this.a = a;
+    this.b = b;
+    this.c = c;
+  }
+
+  /** Computes one element: work item i stores a[i] + b[i] in c[i]. */
+  @Override
+  public void run() {
+    int i = getGlobalId(0);
+    c[i] = a[i] + b[i];
+  }
+}
--- a/src/test/java/edu/thi/phga/aparapi_test/AppTest.java
+++ b/src/test/java/edu/thi/phga/aparapi_test/AppTest.java
@ -0,0 +1,20 @@
+package edu.thi.phga.aparapi_test;
+
+import static org.junit.Assert.assertTrue;
+
+import org.junit.Test;
+
+/**
+ * Placeholder unit test for App; it asserts a constant and exercises no application code.
+ */
+public class AppTest {
+  /** Always passes. */
+  @Test
+  public void shouldAnswerWithTrue() {
+    assertTrue(true);
+  }
+}
				`@ -0,0 +1 @@`
				`((nil . ((projectile-project-install-cmd . "mvn -B clean compile assembly:single && java -jar target/*.jar"))))`