commit 6e15d25210d2d07095b38ab0b939fbe47513bbd6 Author: qhga Date: Mon Jan 10 23:50:17 2022 +0100 init: Beep Boop diff --git a/.dir-locals.el b/.dir-locals.el new file mode 100644 index 0000000..3860e08 --- /dev/null +++ b/.dir-locals.el @@ -0,0 +1 @@ +((nil . ((projectile-project-install-cmd . "mvn -B clean compile assembly:single && java -jar target/*.jar")))) diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5be812d --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +target/ +.settings/ +.project +.classpath \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..33df90a --- /dev/null +++ b/README.md @@ -0,0 +1,26 @@ +# How to run + +Right now, I am to lazy to create an appropriate cli. Therefore, it is that hacky. Sorry... + +```sh +# Device selected by Devices.java (One has to know the selection in advance though...) +# DEFAULT = 1 +DEVICE=1 + +# The different targets in App.java +# 0 = Informational output +# 10 = All PrimeNumber related implementations at once +# 20 = All Reduce related implementations at once +# 21 - 25 = Reduce1 - Reduce5 respectively +# 30 = All Prefix related implementations at once +# 31 - 32 = Prefix1 - Prefix2 respectively +# DEFAULT = 0 +TARGET=31 + +# How many shifts for N (e.g. 
if N should be 8, N_LSHIFTS should be 3) +# Targes 2X and 3X depend on this parameter +# DEFAULT = 25 +N_LSHIFTS=3 + +mvn -B clean compile assembly:single && java -jar target/*.jar $DEVICE $TARGET $N_LSHIFTS +``` \ No newline at end of file diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000..d9d1262 --- /dev/null +++ b/pom.xml @@ -0,0 +1,103 @@ + + + + 4.0.0 + + edu.thi.phga + aparapi-test + 0.0.1-SNAPSHOT + + aparapi-test + + http://www.example.com + + + UTF-8 + 17 + 17 + + + + + + com.aparapi + aparapi + 3.0.0 + + + + junit + junit + 4.11 + test + + + + + + + + + maven-clean-plugin + 3.1.0 + + + + maven-resources-plugin + 3.0.2 + + + maven-compiler-plugin + 3.8.0 + + + maven-surefire-plugin + 2.22.1 + + + maven-jar-plugin + + + + edu.thi.phga.aparapi_test.App + + + + + 3.0.2 + + + maven-assembly-plugin + + + + edu.thi.phga.aparapi_test.App + + + + jar-with-dependencies + + + + + maven-install-plugin + 2.5.2 + + + maven-deploy-plugin + 2.8.2 + + + + maven-site-plugin + 3.7.1 + + + maven-project-info-reports-plugin + 3.0.0 + + + + + diff --git a/src/main/java/edu/thi/phga/aparapi_test/App.java b/src/main/java/edu/thi/phga/aparapi_test/App.java new file mode 100644 index 0000000..c9cd070 --- /dev/null +++ b/src/main/java/edu/thi/phga/aparapi_test/App.java @@ -0,0 +1,100 @@ +package edu.thi.phga.aparapi_test; + +import com.aparapi.device.OpenCLDevice; + +public class App { + public static int choice; + public static OpenCLDevice device; + private static void printHeader(final String txt) { + final String spacer = + "######################################################################"; + System.out.printf("%s\n%s\n", spacer, txt); + } + public static void main( String[] args ) { + // System.out.println("Running the OpenCL Tasks"); + if (args.length == 0) { + App.choice = 0; + } else { + App.choice = Integer.parseInt(args[0]) - 1; + } + App.device = Devices.selectDevice(); + + int target = 0; + if (args.length > 1) { + target = 
Integer.parseInt(args[1]); + } + int shift = 25; + if (args.length > 2) { + shift = Integer.parseInt(args[2]); + } + final int[] b = new int[1 << shift]; + switch (target) { + case 0: + OpenCLGetMemoryInfo.getInfo(); + OpenCLSizeTest.start(8); + break; + // PRIMES + case 10: + printHeader("FindPrimes (Seriell, Parallel, OpenCL)"); + // Ab 1 << 14 stimmt es nicht mehr + // final int[] a = {8, 5, 6, 2, 3, 7, 1, 4}; + // CPU is faster than GPU... + FindPrimes.start(); + FindPrimesThreads.start(); + FindPrimesOpenCL.start(); + break; + // REDUCE + case 20: + java.util.Arrays.fill(b, 1); + OpenCLReduce1.start(b); + java.util.Arrays.fill(b, 1); + OpenCLReduce2.start(b); + java.util.Arrays.fill(b, 1); + OpenCLReduce3.start(b); + java.util.Arrays.fill(b, 1); + OpenCLReduce4.start(b); + java.util.Arrays.fill(b, 1); + OpenCLReduce5.start(b); + break; + case 21: + java.util.Arrays.fill(b, 1); + OpenCLReduce1.start(b); + break; + case 22: + java.util.Arrays.fill(b, 1); + OpenCLReduce2.start(b); + break; + case 23: + java.util.Arrays.fill(b, 1); + OpenCLReduce3.start(b); + break; + case 24: + java.util.Arrays.fill(b, 1); + OpenCLReduce4.start(b); + break; + case 25: + java.util.Arrays.fill(b, 1); + OpenCLReduce5.start(b); + break; + // PREFIX + case 30: + // Up to 1 << 27 + java.util.Arrays.fill(b, 1); + OpenCLPrefix1.start(b); + // Up to 1 << 28 + java.util.Arrays.fill(b, 1); + OpenCLPrefix2.start(b); + break; + case 31: + // Up to 1 << 27 + java.util.Arrays.fill(b, 1); + OpenCLPrefix1.start(b); + break; + case 32: + // Up to 1 << 28 + java.util.Arrays.fill(b, 1); + OpenCLPrefix2.start(b); + break; + } + } +} diff --git a/src/main/java/edu/thi/phga/aparapi_test/Devices.java b/src/main/java/edu/thi/phga/aparapi_test/Devices.java new file mode 100644 index 0000000..c65b57b --- /dev/null +++ b/src/main/java/edu/thi/phga/aparapi_test/Devices.java @@ -0,0 +1,49 @@ +package edu.thi.phga.aparapi_test; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; 
+import java.util.Scanner; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import com.aparapi.device.OpenCLDevice; +import com.aparapi.internal.opencl.OpenCLPlatform; + +public class Devices { + public static OpenCLDevice selectDevice() { + // Informationen über alle verfügbaren OpenCL-Implementierungen ausgeben + + class Processor { // Hilfsklasse + String name, version, typ; + OpenCLDevice device; + + Processor(OpenCLPlatform platform, OpenCLDevice device) { + this.device = device; + name = platform.getName(); + Matcher m = Pattern.compile("\\d.\\d").matcher(platform.getVersion()); + version = m.find() ? m.group() : ""; + typ = device.getType().name(); + } + + @Override public String toString() { + return typ + " - OpenCL " + version + " - " + name; + } + } + + // erzeuge alle Platform-Device-Kombinationen + List processors = new ArrayList<>(); + + for (var platform : OpenCLPlatform.getUncachedOpenCLPlatforms()) + for (var device : platform.getOpenCLDevices()) + processors.add(new Processor(platform, device)); + + for (int i = 0; i < processors.size(); i++) + System.out.println(i + 1 + ") " + processors.get(i)); + + var processor = processors.get(App.choice); + System.out.println("\n" + processor + "\n"); + + return processor.device; + } +} \ No newline at end of file diff --git a/src/main/java/edu/thi/phga/aparapi_test/FindPrimes.java b/src/main/java/edu/thi/phga/aparapi_test/FindPrimes.java new file mode 100644 index 0000000..738f224 --- /dev/null +++ b/src/main/java/edu/thi/phga/aparapi_test/FindPrimes.java @@ -0,0 +1,35 @@ +package edu.thi.phga.aparapi_test; + +public class FindPrimes { + private static final int N = 10_000_000; + private static int START = 123_456_789; + private static boolean[] istPrime = new boolean[N]; + + public static void start() { + long t1 = System.nanoTime(); + primeTest(); + long t2 = System.nanoTime(); + + int a = 0; + for (var b : istPrime) { + if (b) { + a++; + } + } + + System.out.println("Single: " + (t2 - 
t1) / 1000000 + " ms: " + a); + + } + + private static void primeTest() { + for (int z = START, i = 0; i < N; z += 2, i++) { + int teiler = 3; + + while(z > teiler * teiler && z % teiler != 0){ + teiler += 2; + } + + istPrime[i] = z % teiler != 0; + } + } +} diff --git a/src/main/java/edu/thi/phga/aparapi_test/FindPrimesKernel.java b/src/main/java/edu/thi/phga/aparapi_test/FindPrimesKernel.java new file mode 100644 index 0000000..6d127be --- /dev/null +++ b/src/main/java/edu/thi/phga/aparapi_test/FindPrimesKernel.java @@ -0,0 +1,25 @@ +package edu.thi.phga.aparapi_test; +import com.aparapi.Kernel; + +public class FindPrimesKernel extends Kernel { + private int start; + private boolean[] istPrime; + + public FindPrimesKernel(int start, boolean[] istPrime) { + this.start = start; + this.istPrime = istPrime; + } + + @Override public void run() { + int index = getGlobalId(0); + int zahl = start + index * 2; + int teiler = 3; + + while (teiler * teiler < zahl && zahl % teiler != 0) { + teiler += 2; + } + + istPrime[index] = zahl % teiler != 0; + + } +} diff --git a/src/main/java/edu/thi/phga/aparapi_test/FindPrimesOpenCL.java b/src/main/java/edu/thi/phga/aparapi_test/FindPrimesOpenCL.java new file mode 100644 index 0000000..9d2ee8d --- /dev/null +++ b/src/main/java/edu/thi/phga/aparapi_test/FindPrimesOpenCL.java @@ -0,0 +1,28 @@ +package edu.thi.phga.aparapi_test; + +import com.aparapi.Kernel; +import com.aparapi.Range; + +public class FindPrimesOpenCL { + private static final int N = 10_000_000; + private static final int START = 123_456_789; + private static boolean[] istPrime = new boolean[N]; + + public static void start() { + // + Range r = Range.create(Devices.selectDevice(), N, 250); + Kernel k = new FindPrimesKernel(START, istPrime); + k.execute(r); + + k.execute(r); + + int a = 0; + for (var b : istPrime) { + if (b) { + a++; + } + } + System.out.println("OpenCL: " + k.getExecutionTime() + " ms: " + a); + } + +} diff --git 
/**
 * Trial-division prime test over N odd candidates starting at START, split
 * across one task per available processor.
 */
public class FindPrimesThreads {
    private static final int N = 10_000_000;
    private static final int P = Runtime.getRuntime().availableProcessors();
    private static final int START = 123_456_789;
    private static final boolean[] istPrime = new boolean[N];

    /** Runs the threaded prime test and prints the elapsed time and the prime count. */
    public static void start() {
        long t1 = System.nanoTime();
        primeTest();
        long t2 = System.nanoTime();

        int a = countTrue(istPrime);

        System.out.println("Threads(" + P + "): " + (t2 - t1) / 1000000 + " ms: " + a);

    }

    /**
     * Counts the primes among the {@code count} odd numbers
     * {@code first, first + 2, first + 4, ...} using {@code workers} tasks.
     *
     * @param first   first candidate; must be odd and at least 5 (the trial
     *                division starts at 3 and only steps over odd divisors)
     * @param count   number of candidates, must not be negative
     * @param workers number of slices the candidates are split into, at least 1
     * @return number of candidates that are prime
     */
    static int countPrimes(int first, int count, int workers) {
        boolean[] flags = new boolean[count];
        markPrimes(first, flags, workers);
        return countTrue(flags);
    }

    private static void primeTest() {
        markPrimes(START, istPrime, P);
    }

    /** Sets {@code flags[i]} to whether {@code first + 2 * i} is prime, for every index. */
    private static void markPrimes(final int first, final boolean[] flags, final int workers) {
        if (workers < 1) {
            throw new IllegalArgumentException("workers must be >= 1 but was " + workers);
        }

        // One task per non-empty slice. The slice borders are computed so that the
        // slices cover every index, also when the length is not a multiple of the
        // worker count (the previous N / P split dropped the last N % P candidates).
        List<Callable<Object>> tasks = IntStream
            .range(0, workers)
            .mapToObj(w -> new Task(first, flags,
                                    sliceStart(w, workers, flags.length),
                                    sliceStart(w + 1, workers, flags.length)))
            .filter(task -> task.from < task.to)
            .map(Executors::callable)
            .collect(Collectors.toList());

        if (tasks.isEmpty()) {
            return;
        }

        var pool = Executors.newFixedThreadPool(tasks.size());
        try {
            pool.invokeAll(tasks);
        } catch (InterruptedException e) {
            // Keep the interrupt visible to the caller instead of swallowing it.
            Thread.currentThread().interrupt();
        } finally {
            // Without this the pool's worker threads stay alive after the work is done.
            pool.shutdown();
        }
    }

    /** First index of slice {@code slice} when {@code length} items are split into {@code slices} parts. */
    private static int sliceStart(int slice, int slices, int length) {
        return (int) ((long) slice * length / slices);
    }

    private static int countTrue(boolean[] flags) {
        int hits = 0;
        for (var flag : flags) {
            if (flag) {
                hits++;
            }
        }
        return hits;
    }

    /** Tests the candidates belonging to the index range {@code [from, to)}. */
    private static class Task implements Runnable {
        private final int first;
        private final boolean[] flags;
        private final int from;
        private final int to;

        Task(int first, boolean[] flags, int from, int to) {
            this.first = first;
            this.flags = flags;
            this.from = from;
            this.to = to;
        }

        @Override
        public void run() {
            for (int i = from, z = first + 2 * from; i < to; i++, z += 2) {
                int teiler = 3;

                // long product: teiler * teiler must not wrap for large candidates
                while (z > (long) teiler * teiler && z % teiler != 0) {
                    teiler += 2;
                }

                flags[i] = z % teiler != 0;
            }
        }
    }
}
b/src/main/java/edu/thi/phga/aparapi_test/OpenCLGetMemoryInfo.java @@ -0,0 +1,15 @@ +package edu.thi.phga.aparapi_test; + +import com.aparapi.device.Device; + +public class OpenCLGetMemoryInfo { + public static void getInfo() { + int localMem = (int) App.device.getLocalMemSize(); + int maxWGSize = App.device.getMaxWorkGroupSize(); + int localSize = Math.min(maxWGSize, localMem / 4); + + System.out.printf("Memsize: %d, Max-WGSize: %d, LocalSize: %d\n", + localMem, maxWGSize, localSize); + + } +} diff --git a/src/main/java/edu/thi/phga/aparapi_test/OpenCLPrefix1.java b/src/main/java/edu/thi/phga/aparapi_test/OpenCLPrefix1.java new file mode 100644 index 0000000..88190df --- /dev/null +++ b/src/main/java/edu/thi/phga/aparapi_test/OpenCLPrefix1.java @@ -0,0 +1,77 @@ +package edu.thi.phga.aparapi_test; + +import com.aparapi.Kernel; +import com.aparapi.Range; + + +public class OpenCLPrefix1 { + + public static void start(final int[] a) { + int N = a.length; + int b[] = new int[N * 2]; + java.util.Arrays.fill(b, 1); + Range r = Range.create(App.device, N, 4); + OpenCLPrefix1Kernel k = new OpenCLPrefix1Kernel(b); + k.setExplicit(true); + k.setStepSize(1); + k.execute(r); + k.toggleIndex(); + double ct = k.getConversionTime(); + // k.get(b); + // for (int i = 0; i < b.length; i++) { + // System.out.printf("%c[%d] = %d\n", (i < b.length / 2) ? 'A' : 'B', i, b[i]); + // } + + for (int n = 2; n < N; n *= 2) { + k.setStepSize(n); + k.execute(r); + k.toggleIndex(); + // k.get(b); + // for (int i = 0; i < b.length; i++) { + // System.out.printf("%c[%d] = %d\n", (i < b.length / 2) ? 
'A' : 'B', i, b[i]); + // } + } + + k.get(b); + double et = k.getAccumulatedExecutionTime() - ct; + + int n = N + k.getIndex(); + System.out.printf("P1 - GOT: %d, %d, %d, TIME: %.2f ms\n", + b[n - 3], b[n - 2], b[n - 1], et); + } + + private static class OpenCLPrefix1Kernel extends Kernel { + private int stepSize; + private int index; + private int N; + private int[] a; + + public OpenCLPrefix1Kernel(int[] a) { + this.a = a; + this.N = a.length / 2; + } + + public void setStepSize(int s) { + this.stepSize = s; + } + + public void toggleIndex() { + index = N - index; + } + + public int getIndex() { + return index; + } + + @Override + public void run() { + int i = getGlobalId(); + int out = N - index; + if (i < stepSize) { + a[out + i] = a[index + i]; + } else { + a[out + i] = a[index + i] + a[index + i - stepSize]; + } + } + } +} diff --git a/src/main/java/edu/thi/phga/aparapi_test/OpenCLPrefix2.java b/src/main/java/edu/thi/phga/aparapi_test/OpenCLPrefix2.java new file mode 100644 index 0000000..920fe96 --- /dev/null +++ b/src/main/java/edu/thi/phga/aparapi_test/OpenCLPrefix2.java @@ -0,0 +1,112 @@ +package edu.thi.phga.aparapi_test; + +import com.aparapi.Kernel; +import com.aparapi.Range; + +public class OpenCLPrefix2 { + + /** + Parallel Prefix implementation with 2 arrays instead + of one array twice the size of the requested input. + + Accumulated execution time was equally good. + Works up to 1 << 28 before OutOfMemoryError + + Rewrite of the original idea by: Prof. Dr. 
Schmidt + */ + public static void start(int[] a) { + int N = a.length; + int b[] = new int[N]; + java.util.Arrays.fill(b, 1); + Range r = Range.create(App.device, N, 4); + OpenCLPrefix2Kernel k = new OpenCLPrefix2Kernel(a, b); + k.setExplicit(true); + k.setStepSize(1); + k.execute(r); + k.toggleIndex(); + double ct = k.getConversionTime(); + // Uncomment to get Debug output (Try with 1 << 3) + // k.get(a); + // k.get(b); + // System.out.println(k.getIndex()); + // for (int i = 0; i < b.length; i++) { + // System.out.printf("a[%d] = %d\n", i, a[i]); + // } + // for (int i = 0; i < b.length; i++) { + // System.out.printf("b[%d] = %d\n", i, b[i]); + // } + + for (int n = 2; n < N; n *= 2) { + k.setStepSize(n); + k.execute(r); + k.toggleIndex(); + // Uncomment to get Debug output (Try with 1 << 3) + // k.get(a); + // k.get(b); + // System.out.println(k.getIndex()); + // for (int i = 0; i < b.length; i++) { + // System.out.printf("a[%d] = %d\n", i, a[i]); + // } + // for (int i = 0; i < b.length; i++) { + // System.out.printf("b[%d] = %d\n", i, b[i]); + // } + } + + double et = k.getAccumulatedExecutionTime() - ct; + + // Last write was to A + if (k.getIndex() > 0) { + k.get(a); + // Last write was to B + } else { + k.get(b); + a = b; + } + System.out.printf("P2 - GOT: %d, %d, %d, TIME: %.2f ms\n", + a[N - 3], a[N - 2], a[N - 1], et); + + } + + private static class OpenCLPrefix2Kernel extends Kernel { + private int stepSize; + private int index = 1; + private int[] a, b; + + public OpenCLPrefix2Kernel(int[] a, int[] b) { + this.a = a; + this.b = b; + } + + public void setStepSize(int s) { + this.stepSize = s; + } + + public void toggleIndex() { + index = -index; + } + + public int getIndex() { + return index; + } + + @Override + public void run() { + int i = getGlobalId(); + // A -> B + if (index > 0) { + if (i < stepSize) { + b[i] = a[i]; + } else { + b[i] = a[i] + a[i - stepSize]; + } + // B -> A + } else { + if (i < stepSize) { + a[i] = b[i]; + } else { + a[i] = b[i] 
+ b[i - stepSize]; + } + } + } + } +} \ No newline at end of file diff --git a/src/main/java/edu/thi/phga/aparapi_test/OpenCLReduce1.java b/src/main/java/edu/thi/phga/aparapi_test/OpenCLReduce1.java new file mode 100644 index 0000000..d168ec4 --- /dev/null +++ b/src/main/java/edu/thi/phga/aparapi_test/OpenCLReduce1.java @@ -0,0 +1,40 @@ +package edu.thi.phga.aparapi_test; + +import com.aparapi.Kernel; +import com.aparapi.Range; + +public class OpenCLReduce1 { + public static void start(final int[] a) { + int optimalSize = a.length / 2; + int[] gs = new int[optimalSize]; + + Kernel k = new Kernel() { + @Override + public void run() { + int i = getGlobalId(); + int size = getGlobalSize(); + + gs[i] = size; + + for (int s = 1; s <= size; s *= 2) { + if (i % s == 0) { + a[2 * i] += a[2 * i + s]; + } + } + } + }; + k.execute(Range.create(optimalSize, optimalSize <= 256 ? optimalSize : 256)); + double ct = k.getConversionTime(); + double et = k.getExecutionTime() - ct; + + // if (a.length <= 64) { + // for (int x = 0; x < optimalSize; x++) { + // System.out.printf("GI: %d, GSize: %d\n", x, gs[x]); + // } + // } else { + // System.out.printf("GI: %d, GSize: %d\n", 0, gs[0]); + // System.out.printf("GI: %d, GSize: %d\n", optimalSize - 1, gs[optimalSize - 1]); + // } + System.out.printf("R1 - WANT: %d, GOT: %d, TIME: %.2f ms\n", a.length, a[0], et); + } +} diff --git a/src/main/java/edu/thi/phga/aparapi_test/OpenCLReduce2.java b/src/main/java/edu/thi/phga/aparapi_test/OpenCLReduce2.java new file mode 100644 index 0000000..e503752 --- /dev/null +++ b/src/main/java/edu/thi/phga/aparapi_test/OpenCLReduce2.java @@ -0,0 +1,35 @@ +package edu.thi.phga.aparapi_test; + +import com.aparapi.Kernel; +import com.aparapi.Range; + +public class OpenCLReduce2 { + public static void start(int[] a) { + int optimalSize = a.length / 2; + int N = a.length; + int[] gs = new int[optimalSize]; + + Kernel k = new Kernel() { + @Override + public void run() { + int size = N / getGlobalSize(); // 2, 
4, 8, ... + int i = getGlobalId() * size; // 2: 0 -> 0, 1 -> 2, 2 -> 4, 3 -> 6, ... + + gs[getGlobalId()] = size; + + a[i] += a[i + size / 2]; // 1, 2, 4, .. + } + }; + + k.execute(Range.create(N / 2, optimalSize < 256 ? optimalSize : 256)); + double ct = k.getConversionTime(); + + for (int n = N / 4; n >= 1; n /= 2) { + k.execute(Range.create(n, n < 256 ? n : 256)); + } + + double et = k.getAccumulatedExecutionTime() - ct; + System.out.printf("R2 - WANT: %d, GOT: %d, TIME: %.2f ms\n", a.length, a[0], et); + + } +} diff --git a/src/main/java/edu/thi/phga/aparapi_test/OpenCLReduce3.java b/src/main/java/edu/thi/phga/aparapi_test/OpenCLReduce3.java new file mode 100644 index 0000000..7567f2c --- /dev/null +++ b/src/main/java/edu/thi/phga/aparapi_test/OpenCLReduce3.java @@ -0,0 +1,51 @@ +package edu.thi.phga.aparapi_test; + +import com.aparapi.Kernel; +import com.aparapi.Range; + +public class OpenCLReduce3 { + public static void start(int[] a) { + int optimalSize = a.length / 2; + int N = a.length; + int[] gs = new int[optimalSize]; + + Kernel k = new Kernel() { + @Override + public void run() { + int size = N / getGlobalSize(); // 2, 4, 8, ... + int i = getGlobalId() * size; // 2: 0 -> 0, 1 -> 2, 2 -> 4, 3 -> 6, ... + + gs[getGlobalId()] = size; + + a[i] += a[i + size / 2]; // 1, 2, 4, .. + } + }; + + // Manually manage variables that are accessible and returned by kernel + // Anything that is passed to the constructor is still automatically pushed once + // to the Memory on the gpu. This is done to reduce the amount of times + // the data has to be transferred to the gpu memory via slow pci bus + k.setExplicit(true); + + try { + System.out.printf("MAX WG SIZE: %d\n", k.getKernelMaxWorkGroupSize(App.device)); + } catch(Exception e) { + + } + // Not necessary (So maybe anything used by the kernel is put once?) + // k.put(a); + + k.execute(Range.create(N / 2, optimalSize < 256 ? 
optimalSize : 256)); + double ct = k.getConversionTime(); + + for (int n = N / 4; n >= 1; n /= 2) { + k.execute(Range.create(n, n < 256 ? n : 256)); + } + + k.get(a); + + double et = k.getAccumulatedExecutionTime() - ct; + System.out.printf("R3 - WANT: %d, GOT: %d, TIME: %.2f ms\n", a.length, a[0], et); + + } +} diff --git a/src/main/java/edu/thi/phga/aparapi_test/OpenCLReduce4.java b/src/main/java/edu/thi/phga/aparapi_test/OpenCLReduce4.java new file mode 100644 index 0000000..75b55aa --- /dev/null +++ b/src/main/java/edu/thi/phga/aparapi_test/OpenCLReduce4.java @@ -0,0 +1,51 @@ +package edu.thi.phga.aparapi_test; + +import com.aparapi.Kernel; +import com.aparapi.Range; + +public class OpenCLReduce4 { + public static void start(int[] a) { + int optimalSize = a.length / 2; + int N = a.length; + int[] gs = new int[optimalSize]; + + Kernel k = new Kernel() { + @Override + public void run() { + int size = getGlobalSize(); // 2, 4, 8, ... + int i = getGlobalId(); // 2: 0 -> 0, 1 -> 2, 2 -> 4, 3 -> 6, ... + + gs[getGlobalId()] = size; + + a[i] += a[i + size]; // 1, 2, 4, .. + } + }; + + // Manually manage variables that are accessible and returned by kernel + // Anything that is passed to the constructor is still automatically pushed once + // to the Memory on the gpu. This is done to reduce the amount of times + // the data has to be transferred to the gpu memory via slow pci bus + k.setExplicit(true); + + try { + System.out.printf("MAX WG SIZE: %d\n", k.getKernelMaxWorkGroupSize(App.device)); + } catch(Exception e) { + + } + // Not necessary (So maybe anything used by the kernel is put once?) + // k.put(a); + + k.execute(Range.create(N / 2, optimalSize < 256 ? optimalSize : 256)); + double ct = k.getConversionTime(); + + for (int n = N / 4; n >= 1; n /= 2) { + k.execute(Range.create(n, n < 256 ? 
n : 256)); + } + + k.get(a); + + double et = k.getAccumulatedExecutionTime() - ct; + System.out.printf("R4 - WANT: %d, GOT: %d, TIME: %.2f ms\n", a.length, a[0], et); + + } +} diff --git a/src/main/java/edu/thi/phga/aparapi_test/OpenCLReduce5.java b/src/main/java/edu/thi/phga/aparapi_test/OpenCLReduce5.java new file mode 100644 index 0000000..d4e4bb9 --- /dev/null +++ b/src/main/java/edu/thi/phga/aparapi_test/OpenCLReduce5.java @@ -0,0 +1,52 @@ +package edu.thi.phga.aparapi_test; + +import com.aparapi.Range; + +public class OpenCLReduce5 { + public static void start(int[] a) { + int N = a.length; + + int localMem = (int) App.device.getLocalMemSize(); + int maxWGSize = Math.min(App.device.getMaxWorkGroupSize(), 256); + + int localSize = Math.min(maxWGSize, localMem / 4); + int globalSize = Math.min(N, 2 * localSize * localSize); + localSize = Math.min(localSize, globalSize); + + // These have to be created on the host side for the kernel to access them later + // int[] a = new int[N] Folie 180 das input array + int[] result = new int[2 * localSize]; // Folie 180 + int[] summe = { 0 }; // Folie 180 + + OpenCLReduce5Kernel k = new OpenCLReduce5Kernel(a, localSize, result, summe); + + try { + System.out.printf("KERNEL MAX WG SIZE: %d\n", k.getKernelMaxWorkGroupSize(App.device)); + } catch(Exception e) { + + } + + // Manually manage variables that are accessible and returned by kernel + // Anything that is passed to the constructor is still automatically pushed once + // to the Memory on the gpu. This is done to reduce the amount of times + // the data has to be transferred to the gpu memory via slow pci bus + k.setExplicit(true); + + Range r1 = Range.create(App.device, globalSize, localSize); + Range r2 = Range.create(App.device, localSize, localSize); + + // Not necessary (So maybe anything used by the kernel is put once?) 
+ // k.put(a); + + k.execute(r1); + double ct = k.getConversionTime(); + + k.setStep(2); + k.execute(r2); + k.get(summe); + + double et = k.getAccumulatedExecutionTime() - ct; + System.out.printf("R5 - WANT: %d, GOT: %d, TIME: %.2f ms\n", a.length, summe[0], et); + + } +} diff --git a/src/main/java/edu/thi/phga/aparapi_test/OpenCLReduce5Kernel.java b/src/main/java/edu/thi/phga/aparapi_test/OpenCLReduce5Kernel.java new file mode 100644 index 0000000..d57a6dc --- /dev/null +++ b/src/main/java/edu/thi/phga/aparapi_test/OpenCLReduce5Kernel.java @@ -0,0 +1,76 @@ +package edu.thi.phga.aparapi_test; + +import com.aparapi.Kernel; + +public class OpenCLReduce5Kernel extends Kernel { + private int[] a, result, summe; + private int step = 1; + @Local + private int[] scratch; + + public OpenCLReduce5Kernel(int[] a, int localSize, int[] result, int[] summe) { + this.a = a; + this.result = result; + this.summe = summe; + scratch = new int[localSize]; + } + + @Override + public void run() { + if (this.step == 1) { + step1(); + } else { + step2(); + } + } + + public void setStep(int step) { + this.step = step; + } + + private void step1() { + int globalId = getGlobalId(); + int localId = getLocalId(); + int globalSize = getGlobalSize(); + + int sum = 0; + + for (int i = globalId; i < a.length; i += globalSize) { + sum += a[i]; + } + + scratch[localId] = sum; + localBarrier(); + + for (int s = getLocalSize() / 2; s > 0; s /= 2) { + if (localId < s) { + scratch[localId] += scratch[localId + s]; + } + localBarrier(); + } + + if (localId == 0) { + result[getGroupId()] = scratch[0]; + } + } + + // Verarbeitung der einzelnen Elemente in result[] + // Erst wieder einmal sequenziell, dann parallel + private void step2() { + int localId = getLocalId(); + int localSize = getLocalSize(); + scratch[localId] = result[localId] + result[localId + localSize]; + localBarrier(); // Wait for all to finish (locally) + + for (int s = localSize / 2; s > 0; s /= 2) { + if (localId < s) { + 
scratch[localId] += scratch[localId + s]; + } + localBarrier(); + } + if (localId == 0) { + summe[0] = scratch[0]; + } + } + +} diff --git a/src/main/java/edu/thi/phga/aparapi_test/OpenCLSizeTest.java b/src/main/java/edu/thi/phga/aparapi_test/OpenCLSizeTest.java new file mode 100644 index 0000000..d1697cd --- /dev/null +++ b/src/main/java/edu/thi/phga/aparapi_test/OpenCLSizeTest.java @@ -0,0 +1,47 @@ +package edu.thi.phga.aparapi_test; + +import com.aparapi.Kernel; +import com.aparapi.Range; + +public class OpenCLSizeTest { + public static void start(int n) { + final int N = n; + int[] res = new int[N]; + int[] gids = new int[N]; + int[] lids = new int[N]; + int[] gsizes = new int[N]; + int[] lsizes = new int[N]; + + new Kernel() { + @Override + public void run() { + int i = getGlobalId(); + + int gi = getGlobalId(); + int li = getLocalId(); + int gs = getGlobalSize(); + int ls = getLocalSize(); + + gids[i] = gi; + lids[i] = li; + gsizes[i] = gs; + lsizes[i] = ls; + + res[i] = 10 + i; + } + }.execute(Range.create(N, 4)); + + if (N < 3000) { + for (int x = 0; x < N; x++) { + System.out.printf("GI: %d, LI: %d, GSize: %d, LSize: %d, RES: %d\n", + gids[x], lids[x], gsizes[x], lsizes[x], res[x]); + + } + } else { + System.out.printf("GI: %d, LI: %d, GSize: %d, LSize: %d, RES: %d\n", + gids[0], lids[0], gsizes[0], lsizes[0], res[0]); + System.out.printf("GI: %d, LI: %d, GSize: %d, LSize: %d, RES: %d\n", + gids[N - 1], lids[N - 1], gsizes[N - 1], lsizes[N - 1], res[N - 1]); + } + } +} diff --git a/src/main/java/edu/thi/phga/aparapi_test/VAdd.java b/src/main/java/edu/thi/phga/aparapi_test/VAdd.java new file mode 100644 index 0000000..137be49 --- /dev/null +++ b/src/main/java/edu/thi/phga/aparapi_test/VAdd.java @@ -0,0 +1,46 @@ +package edu.thi.phga.aparapi_test; + +import com.aparapi.Kernel; +import com.aparapi.Range; +import com.aparapi.device.Device; + +public class VAdd { + public VAdd() { + // System.setProperty("com.aparapi.enableShowGeneratedOpenCL", "true"); + int[] 
a = {1, 2, 3, 4}; + int[] b = {5, 6, 7, 8}; + int[] c = new int[a.length]; + + Device device = Devices.selectDevice(); + Range r = Range.create(device, a.length); + Kernel vak = new VAddKernel(a, b, c); + vak.execute(r); // Execution erzwingen um ConversionTime zu erhalten + final double ctime = vak.getConversionTime(); + final double etime = vak.getExecutionTime(); + + System.out.println("Conversion Time: " + ctime + " ms"); + System.out.println("Execution Time: " + etime + " ms"); + System.out.println("Execution w/o Conversion Time: " + (etime - ctime) + " ms"); + + vak.execute(r); + final double etime2 = vak.getExecutionTime(); + System.out.println("Execution Time(2): " + etime2 + " ms"); + + for (int i = 0; i < a.length; i++) + System.out.printf("%d + %d = %2d\n", a[i], b[i], c[i]); + } + + + // CPU + // Conversion Time: 309.714392 ms + // Execution Time: 310.215247 ms + // Execution w/o Conversion Time: 0.5008550000000014 ms + // Execution Time(2): 0.103835 ms + + + // GPU + // Conversion Time: 249.313076 ms + // Execution Time: 250.076624 ms + // Execution w/o Conversion Time: 0.7635480000000143 ms + // Execution Time(2): 0.075589 ms +} diff --git a/src/main/java/edu/thi/phga/aparapi_test/VAddKernel.java b/src/main/java/edu/thi/phga/aparapi_test/VAddKernel.java new file mode 100644 index 0000000..8d4b988 --- /dev/null +++ b/src/main/java/edu/thi/phga/aparapi_test/VAddKernel.java @@ -0,0 +1,19 @@ +package edu.thi.phga.aparapi_test; + +import com.aparapi.Kernel; + +public class VAddKernel extends Kernel { + private int[] a, b, c; + + public VAddKernel(int[] a, int[] b, int[] c) { + this.a = a; + this.b = b; + this.c = c; + } + + @Override + public void run() { + int i = getGlobalId(0); + c[i] = a[i] + b[i]; + } +} diff --git a/src/test/java/edu/thi/phga/aparapi_test/AppTest.java b/src/test/java/edu/thi/phga/aparapi_test/AppTest.java new file mode 100644 index 0000000..547ed16 --- /dev/null +++ b/src/test/java/edu/thi/phga/aparapi_test/AppTest.java @@ -0,0 +1,20 
@@ +package edu.thi.phga.aparapi_test; + +import static org.junit.Assert.assertTrue; + +import org.junit.Test; + +/** + * Unit test for simple App. + */ +public class AppTest +{ + /** + * Rigorous Test :-) + */ + @Test + public void shouldAnswerWithTrue() + { + assertTrue( true ); + } +}