This commit is contained in:
shenjack 2024-10-16 21:26:54 +08:00
commit fc013bae96
Signed by: shenjack
GPG Key ID: 7B1134A979775551
5 changed files with 469 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
/target

210
Cargo.lock generated Normal file
View File

@ -0,0 +1,210 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "addr2line"
version = "0.22.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e4503c46a5c0c7844e948c9a4d6acd9f50cccb4de1c48eb9e291ea17470c678"
dependencies = [
"gimli",
]
[[package]]
name = "adler"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
[[package]]
name = "anyhow"
version = "1.0.86"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b3d1d046238990b9cf5bcde22a3fb3584ee5cf65fb2765f454ed428c7a0063da"
dependencies = [
"backtrace",
]
[[package]]
name = "backtrace"
version = "0.3.73"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5cc23269a4f8976d0a4d2e7109211a419fe30e8d88d677cd60b6bc79c5732e0a"
dependencies = [
"addr2line",
"cc",
"cfg-if",
"libc",
"miniz_oxide",
"object",
"rustc-demangle",
]
[[package]]
name = "cc"
version = "1.0.104"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "74b6a57f98764a267ff415d50a25e6e166f3831a5071af4995296ea97d210490"
[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "cl3"
version = "0.9.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b823f24e72fa0c68aa14a250ae1c0848e68d4ae188b71c3972343e45b46f8644"
dependencies = [
"libc",
"opencl-sys",
"thiserror",
]
[[package]]
name = "gimli"
version = "0.29.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "40ecd4077b5ae9fd2e9e169b102c6c330d0605168eb0e8bf79952b256dbefffd"
[[package]]
name = "libc"
version = "0.2.155"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c"
[[package]]
name = "memchr"
version = "2.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
[[package]]
name = "miniz_oxide"
version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08"
dependencies = [
"adler",
]
[[package]]
name = "object"
version = "0.36.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "081b846d1d56ddfc18fdf1a922e4f6e07a11768ea1b92dec44e42b72712ccfce"
dependencies = [
"memchr",
]
[[package]]
name = "opencl-sys"
version = "0.2.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "de15dd01496ae90c5799f5266184ab020082b4065800ff0b732f489371d0e5cf"
dependencies = [
"libc",
]
[[package]]
name = "opencl-test"
version = "0.1.0"
dependencies = [
"anyhow",
"opencl3",
]
[[package]]
name = "opencl3"
version = "0.9.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26ab4a90cb496f787d3934deb0c54fa9d65e7bed710c10071234aab0196fba04"
dependencies = [
"cl3",
"libc",
"serde",
]
[[package]]
name = "proc-macro2"
version = "1.0.86"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.36"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7"
dependencies = [
"proc-macro2",
]
[[package]]
name = "rustc-demangle"
version = "0.1.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f"
[[package]]
name = "serde"
version = "1.0.203"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.203"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "syn"
version = "2.0.68"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "901fa70d88b9d6c98022e23b4136f9f3e54e4662c3bc1bd1d84a42a9a0f0c1e9"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "thiserror"
version = "1.0.61"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c546c80d6be4bc6a00c0f01730c08df82eaa7a7a61f11d656526506112cc1709"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "1.0.61"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "unicode-ident"
version = "1.0.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"

8
Cargo.toml Normal file
View File

@ -0,0 +1,8 @@
[package]
name = "opencl-test"
version = "0.1.0"
edition = "2021"
[dependencies]
anyhow = { version = "1.0", features = ["backtrace"] }
opencl3 = { version = "0.9.5", features = ["serde"] }

157
src/main.rs Normal file
View File

@ -0,0 +1,157 @@
use opencl3::command_queue::{CommandQueue, CL_QUEUE_PROFILING_ENABLE};
use opencl3::context::Context;
use opencl3::device::{get_all_devices, get_device_info, Device, CL_DEVICE_MAX_WORK_GROUP_SIZE_AMD, CL_DEVICE_TYPE_GPU};
use opencl3::kernel::{ExecuteKernel, Kernel};
use opencl3::memory::{Buffer, CL_MAP_WRITE, CL_MEM_READ_ONLY};
use opencl3::program::Program;
use opencl3::svm::SvmVec;
use opencl3::types::{cl_int, cl_uchar, CL_BLOCKING};
use std::ptr;
const PROGRAM_SOURCE: &str = include_str!("program.cl");
const KERNEL_NAME: &str = "load_team";
const BLOCK_SIZE: usize = 256;
fn main() -> anyhow::Result<()> {
// Find a usable device for this application
let device_id = *get_all_devices(CL_DEVICE_TYPE_GPU)?
.first()
.expect("no device found in platform");
let size = match get_device_info(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE_AMD) {
Ok(size) => {
size.to_size()
},
Err(err) => {
println!("get_device_info failed: {}", err);
panic!();
}
};
let device = Device::new(device_id);
let worker_count: cl_int = size as cl_int;
println!("device max work group size: {} real count: {}", size, worker_count);
// Create a Context on an OpenCL device
let context = Context::from_device(&device).expect("Context::from_device failed");
let queue = CommandQueue::create_default_with_properties(
&context,
CL_QUEUE_PROFILING_ENABLE,
worker_count as u32,
)
.expect("create_command_queue_with_properties failed");
// Build the OpenCL program source and create the kernel.
let program = match Program::create_and_build_from_source(&context, PROGRAM_SOURCE, "") {
Ok(p) => p,
Err(err) => {
println!(
"OpenCL Program::create_and_build_from_source failed: {}",
err
);
panic!();
}
};
let kernel = Kernel::create(&program, KERNEL_NAME).expect("Kernel::create failed");
let team_raw_vec = vec!["x"; worker_count as usize];
let name_raw_vec = vec!["x"; worker_count as usize];
let team_bytes_vec = team_raw_vec.iter().map(|s| s.as_bytes()).collect::<Vec<&[u8]>>();
let name_bytes_vec = name_raw_vec.iter().map(|s| s.as_bytes()).collect::<Vec<&[u8]>>();
let t_len_vec = team_bytes_vec.iter().map(|s| s.len() as cl_int + 1).collect::<Vec<i32>>();
let n_len_vec = name_bytes_vec.iter().map(|s| s.len() as cl_int).collect::<Vec<i32>>();
let work_count = team_bytes_vec.len();
// Create OpenCL device buffers
let mut team = unsafe {
Buffer::<cl_uchar>::create(
&context,
CL_MEM_READ_ONLY,
BLOCK_SIZE * work_count,
ptr::null_mut(),
)?
};
let mut name = unsafe {
Buffer::<cl_uchar>::create(
&context,
CL_MEM_READ_ONLY,
BLOCK_SIZE * work_count,
ptr::null_mut(),
)?
};
let mut t_len = unsafe {
Buffer::<cl_int>::create(&context, CL_MEM_READ_ONLY, work_count, ptr::null_mut())?
};
let mut n_len = unsafe {
Buffer::<cl_int>::create(&context, CL_MEM_READ_ONLY, work_count, ptr::null_mut())?
};
let mut output = SvmVec::<cl_uchar>::allocate(&context, BLOCK_SIZE * work_count)?;
// 准备一下数据, 都给拼成一维数组
// 填充成 256 * len
let team_data_vec = {
let mut vec = Vec::new();
for data in team_bytes_vec {
let left_over = BLOCK_SIZE - data.len();
vec.extend_from_slice(data);
vec.extend_from_slice(&vec![0; left_over]);
}
vec
};
let name_data_vec = {
let mut vec = Vec::new();
for data in name_bytes_vec {
let left_over = BLOCK_SIZE - data.len();
vec.extend_from_slice(data);
vec.extend_from_slice(&vec![0; left_over]);
}
vec
};
// 阻塞写
let _team_write_event =
unsafe { queue.enqueue_write_buffer(&mut team, CL_BLOCKING, 0, &team_data_vec, &[]) }?;
let _name_write_event =
unsafe { queue.enqueue_write_buffer(&mut name, CL_BLOCKING, 0, &name_data_vec, &[]) }?;
let _t_len_write_event =
unsafe { queue.enqueue_write_buffer(&mut t_len, CL_BLOCKING, 0, &t_len_vec, &[]) }?;
let _n_len_write_event =
unsafe { queue.enqueue_write_buffer(&mut n_len, CL_BLOCKING, 0, &n_len_vec, &[]) }?;
// println!("output: {:?} {}", output, output.len());
let kernel_event = unsafe {
ExecuteKernel::new(&kernel)
.set_arg(&team)
.set_arg(&t_len)
.set_arg(&name)
.set_arg(&n_len)
.set_arg_svm(output.as_mut_ptr())
.set_arg(&worker_count)
.set_global_work_size(worker_count as usize)
.enqueue_nd_range(&queue)?
};
kernel_event.wait()?;
queue.finish()?;
if !output.is_fine_grained() {
unsafe { queue.enqueue_svm_map(CL_BLOCKING, CL_MAP_WRITE, &mut output, &[]) }?;
}
let start_time = kernel_event.profiling_command_start()?;
let end_time = kernel_event.profiling_command_end()?;
let duration = end_time - start_time;
let time = std::time::Duration::from_nanos(duration as u64);
println!("kernel execution duration: {:?}", time);
let pre_sec = 1_000_000_000 as f32 / duration as f32;
println!("kernel execution speed (pre/sec): {:?}", pre_sec * worker_count as f32);
// println!("output: {:?} {}", output, output.len());
if !output.is_fine_grained() {
let unmap_event = unsafe { queue.enqueue_svm_unmap(&output, &[]) }?;
unmap_event.wait()?;
}
Ok(())
}

93
src/program.cl Normal file
View File

@ -0,0 +1,93 @@
// 取中值
uchar median(uchar a, uchar b, uchar c) {
return max(min(a, b), min(max(a, b), c));
}
// 输入: 1~255 长度的 u8 数组
// 输出: 255 长度的 u8 数组
kernel void load_team(
global const uchar* all_team_bytes,
global const int* all_t_len,
global const uchar* all_name_bytes,
global const int* all_n_len,
// 一个 svm [u8; 256] * worker_count
global uchar* all_val,
int worker_count
) {
int gid = get_global_id(0);
if (gid >= worker_count) {
return;
}
local uchar val[256];
local uchar team_bytes[256];
local uchar name_bytes[256];
for (int i = 0; i < 256; i++) {
val[i] = i;
}
for (int i = 0; i < 256; i += 4) {
vstore4(vload4(0, &all_team_bytes[256 * gid + i]), i, team_bytes);
vstore4(vload4(0, &all_name_bytes[256 * gid + i]), i, name_bytes);
}
int t_len = all_t_len[gid];
int n_len = all_n_len[gid];
// 外面初始化好了
uchar s = 0;
for (int i = s = 0; i < 256; ++i) {
if (i % t_len) {
s += team_bytes[i % t_len - 1];
}
s += val[i];
uchar tmp = val[i];
val[i] = val[s];
val[s] = tmp;
}
for (int _ = 0; _ < 2; _++) {
uchar s = 0;
uchar k = 0;
for (int i = 0; i < 256; i++) {
if (k != 0) {
s += name_bytes[k - 1];
}
s += val[i];
uchar tmp = val[i];
val[i] = val[s];
val[s] = tmp;
if (k == n_len) {
k = 0;
} else {
k++;
}
}
}
local uchar val_2[256];
for (int i = 0; i < 256; i++) {
val_2[i] = val[i] * 181 + 160;
}
local uchar name_nase[40];
local int b_counter;
b_counter = 0;
for (int i = 0; i < 256; i += 1) {
if (val_2[i] >= 89 && val_2[i] < 217) {
name_nase[b_counter] = val_2[i] & 63;
b_counter++;
if (b_counter >= 40) {
break;
}
}
}
// 将结果从局部内存拷贝回全局内存
// for (int i = 0; i < 256; i += 4) {
// vstore4(vload4(0, &val_2[i]), i, &all_val[256 * gid + i]);
// }
// 这里这么整一下, 防止他优化掉最后的这点东西
// all_val[256 * gid] = name_nase[0];
}