sekrit-twc · Stefan-Olt · Oct 10, 2024 · Oct 10, 2024
diff --git a/meson.build b/meson.build
@@ -0,0 +1,159 @@
+project('znedi3', 'c', 'cpp',
+        version: '2.2',
+        default_options: ['c_std=c99', 'cpp_std=c++14', 'buildtype=release', 'b_lto=true'],
+        meson_version: '>=0.46')
+
+
+warnings = [
+  '-Wall',
+  '-Wextra',
+  '-Wshadow',
+]
+
+cflags = [
+  warnings,
+  '-fvisibility=hidden',
+  '-DGRAPHENGINE_IMPL_NAMESPACE=znedi3',
+  '-DPACKAGE_VERSION="@0@"'.format(meson.project_version()),
+  '-DNDEBUG',
+  '-Igraphengine/include',
+  '-Iznedi3',
+  '-Ivsxx',
+  '-Ivsxx/vapoursynth',
+]
+
+ldflags = [
+]
+
+includes = include_directories(
+  'graphengine/include',
+  'znedi3',
+  'vsxx',
+  'vsxx/vapoursynth',
+)
+
+znedi3_sources = [
+  'znedi3/kernel.cpp',
+  'znedi3/weights.cpp',
+  'znedi3/znedi3.cpp',
+  'znedi3/znedi3_impl.cpp',
+]
+
+vsznedi3_sources = [
+  'vsznedi3/vsznedi3.cpp',
+  'vsxx/vsxx4_pluginmain.cpp',
+]
+
+graphengine_sources = [
+  'graphengine/graphengine/cpuinfo.cpp',
+  'graphengine/graphengine/graph.cpp',
+  'graphengine/graphengine/node.cpp',
+]
+
+testapp_sources = [
+  'testapp/argparse.cpp',
+  'testapp/main.cpp',
+  'testapp/mmap.cpp',
+  'testapp/win32_bitmap.cpp',
+]
+
+znedi3_helper_libs = []
+
+host_cpu_family = host_machine.cpu_family()
+
+if host_cpu_family.startswith('x86')
+  cflags += ['-mfpmath=sse', '-msse2', '-DZNEDI3_X86=1']
+
+  znedi3_sources += [
+    'znedi3/x86/cpuinfo_x86.cpp',
+    'znedi3/x86/kernel_x86.cpp',
+    'znedi3/x86/kernel_sse.cpp',
+    'znedi3/x86/kernel_sse2.cpp',
+  ]
+
+  znedi3_sources_x86_avx = [
+    'znedi3/x86/kernel_avx.cpp',
+  ]
+
+  znedi3_sources_x86_f16c = [
+    'znedi3/x86/kernel_f16c.cpp',
+  ]
+
+  znedi3_sources_x86_avx2 = [
+    'znedi3/x86/kernel_avx2.cpp',
+  ]
+
+  znedi3_sources_x86_avx512 = [
+    'znedi3/x86/kernel_avx512.cpp',
+  ]
+
+  znedi3_helper_libs += static_library('avx',
+                                znedi3_sources_x86_avx,
+                                include_directories: includes,
+                                cpp_args: [cflags, '-mavx', '-mtune=sandybridge'],
+                                pic: true,
+                                install: false)
+
+  znedi3_helper_libs += static_library('f16c',
+                                znedi3_sources_x86_f16c,
+                                include_directories: includes,
+                                cpp_args: [cflags, '-mavx', '-mf16c', '-mtune=ivybridge'],
+                                pic: true,
+                                install: false)
+
+  znedi3_helper_libs += static_library('avx2',
+                                znedi3_sources_x86_avx2,
+                                include_directories: includes,
+                                cpp_args: [cflags, '-mavx2', '-mfma', '-mtune=haswell'],
+                                pic: true,
+                                install: false)
+
+  znedi3_helper_libs += static_library('avx512',
+                                znedi3_sources_x86_avx512,  # kernel_avx512.cpp only, so AVX-512 flags stay confined to it
+                                include_directories: includes,
+                                cpp_args: [cflags, '-mavx512f', '-mfma', '-mtune=skylake-avx512'],
+                                pic: true,
+                                install: false)
+
+  graphengine_sources += [
+    'graphengine/graphengine/x86/cpuinfo_x86.cpp'
+  ]
+
+endif
+
+if host_cpu_family.startswith('aarch') or host_cpu_family.startswith('arm')
+  cflags += ['-DZNEDI3_ARM=1']
+
+  znedi3_sources += [
+    'znedi3/arm/kernel_arm.cpp',
+    'znedi3/arm/kernel_neon.cpp',
+  ]
+endif
+
+znedi3_helper_libs += static_library('graphengine',
+                              graphengine_sources,
+                              include_directories: includes,
+                              cpp_args: [cflags],
+                              pic: true,
+                              install: false)
+
+cxx = meson.get_compiler('cpp')
+
+
+shared_module('vsznedi3',
+              znedi3_sources + vsznedi3_sources,
+              include_directories: includes,
+              link_args: ldflags,
+              c_args: cflags,
+              cpp_args: cflags,
+              link_with: znedi3_helper_libs,
+              install: true)
+
+executable   ('testapp',
+              znedi3_sources + testapp_sources,
+              include_directories: includes,
+              link_args: ldflags,
+              c_args: cflags,
+              cpp_args: cflags,
+              link_with: znedi3_helper_libs,
+              install: false)
diff --git a/znedi3/arm/kernel_arm.cpp b/znedi3/arm/kernel_arm.cpp
@@ -0,0 +1,99 @@
+#ifdef ZNEDI3_ARM
+
+#include <algorithm>
+#include <cassert>
+#include "alloc.h"
+#include "kernel.h"
+#include "kernel_arm.h"
+#include "znedi3_impl.h"
+
+namespace znedi3 {
+namespace {
+
+pixel_io_func select_pixel_io_func_neon(PixelType in, PixelType out)
+{
+	// Only BYTE/WORD <-> FLOAT conversions have a kernel; any other pairing
+	// (including FLOAT -> FLOAT) yields nullptr.
+	if (out == PixelType::FLOAT) {
+		if (in == PixelType::BYTE) return byte_to_float_sse2;
+		if (in == PixelType::WORD) return word_to_float_sse2;
+	} else if (in == PixelType::FLOAT) {
+		if (out == PixelType::BYTE) return float_to_byte_sse2;
+		if (out == PixelType::WORD) return float_to_word_sse2;
+	}
+	return nullptr;
+}
+
+
+} // namespace
+
+InterleavedPredictorModel create_interleaved_predictor_model(const PredictorModel &model)
+{
+	assert(model.first.nns % 16 == 0);
+
+	const unsigned nns = model.first.nns;
+	const unsigned taps = model.first.xdim * model.first.ydim;
+
+	PredictorModel centered = copy_model(model); // subtract_mean() is applied to this copy, not to the caller's model
+	subtract_mean(centered);
+
+	InterleavedPredictorModel out{};
+	out.xdim = centered.first.xdim;
+	out.ydim = centered.first.ydim;
+	out.nns = centered.first.nns;
+
+	// A single buffer backs all four arrays: 2 * nns filters and 2 * nns biases for each of q1 and q2.
+	out.data.resize(nns * taps * 4 + nns * 4);
+	LinearAllocator alloc{ out.data.data() };
+	out.neurons_q1 = alloc.allocate_n<float>(nns * taps * 2);
+	out.bias_q1 = alloc.allocate_n<float>(nns * 2);
+	out.neurons_q2 = alloc.allocate_n<float>(nns * taps * 2);
+	out.bias_q2 = alloc.allocate_n<float>(nns * 2);
+	assert(alloc.count() / sizeof(float) == out.data.size());
+
+	// Row k of each neuron array holds tap k of every softmax neuron, then tap k of every elliott neuron.
+	for (unsigned k = 0; k < taps; ++k) {
+		const unsigned row = k * nns * 2;
+		for (unsigned nn = 0; nn < nns; ++nn) {
+			const unsigned src = nn * taps + k;
+			out.neurons_q1[row + nn] = centered.second.softmax_q1[src];
+			out.neurons_q1[row + nn + nns] = centered.second.elliott_q1[src];
+			out.neurons_q2[row + nn] = centered.second.softmax_q2[src];
+			out.neurons_q2[row + nn + nns] = centered.second.elliott_q2[src];
+		}
+	}
+	std::copy_n(centered.second.softmax_bias_q1, nns, out.bias_q1);
+	std::copy_n(centered.second.elliott_bias_q1, nns, out.bias_q1 + nns);
+	std::copy_n(centered.second.softmax_bias_q2, nns, out.bias_q2);
+	std::copy_n(centered.second.elliott_bias_q2, nns, out.bias_q2 + nns);
+	return out;
+}
+
+pixel_io_func select_pixel_io_func_arm(PixelType in, PixelType out, CPUClass /* cpu */)
+{
+	return select_pixel_io_func_neon(in, out); // single code path: the CPU class is not consulted
+}
+
+interpolate_func select_interpolate_func_arm(CPUClass /* cpu */)
+{
+	return cubic_interpolation_sse2; // single code path: the CPU class is not consulted
+}
+
+std::unique_ptr<Prescreener> create_prescreener_old_arm(const PrescreenerOldCoefficients &coeffs, double pixel_half, CPUClass /* cpu */)
+{
+	return create_prescreener_old_sse(coeffs, pixel_half); // single code path: the CPU class is not consulted
+}
+
+std::unique_ptr<Prescreener> create_prescreener_new_arm(const PrescreenerNewCoefficients &coeffs, double pixel_half, CPUClass /* cpu */)
+{
+	return create_prescreener_new_sse(coeffs, pixel_half); // single code path: the CPU class is not consulted
+}
+
+std::unique_ptr<Predictor> create_predictor_arm(const PredictorModel &model, bool use_q2, CPUClass /* cpu */)
+{
+	return create_predictor_sse2(model, use_q2); // single code path: the CPU class is not consulted
+}
+
+} // namespace znedi3
+
+#endif // ZNEDI3_ARM
diff --git a/znedi3/arm/kernel_arm.h b/znedi3/arm/kernel_arm.h
@@ -0,0 +1,88 @@
+#pragma once
+
+#ifdef ZNEDI3_ARM
+
+#ifndef ARM_KERNEL_ARM_H_
+#define ARM_KERNEL_ARM_H_
+
+#include <memory>
+#include <utility>
+#include "alloc.h"
+#include "kernel.h"
+#include "weights.h"
+
+namespace znedi3 {
+
+enum class CPUClass;
+enum class PixelType;
+
+// Polynomial coefficients for exp2f(x - 1) on the domain [1.0, 2.0].
+// Coefficients are stored low-order to high-order.
+constexpr float EXP2F_X_PLUS1_REMEZ[] = {
+	0.509871020343597804469416f,
+	0.312146713032169896138863f,
+	0.166617139319965966118107f,
+	-2.19061993049215080032874e-3f,
+	1.3555747234758484073940937e-2f
+};
+
+// Coefficients such that converting (EXPF_LN2_INV_SCALED * x + EXPF_ONE_SCALED)
+// to integer and reinterpreting the result as a float produces the integer
+// component of (x / ln(2)) in the exponent and the fractional component in the
+// mantissa.
+constexpr float EXPF_LN2_INV_SCALED = 12102203.1615614f; // (1.0 / (127 * ln(2))) * EXPF_ONE_SCALED.
+constexpr float EXPF_ONE_SCALED = 1065353216.f; // Integer representation of 1.0f.
+
+
+struct InterleavedPredictorModel {
+	AlignedVector<float> data;
+	unsigned xdim;
+	unsigned ydim;
+	unsigned nns;
+
+	// Filter coefficients are stored interleaved by tap: for each filter tap
+	// k, the k-th coefficient of every softmax neuron is stored contiguously,
+	// followed by the k-th coefficient of every elliott neuron.
+	//
+	// f[nn=0][k=0] f[nn=1][k=0] f[nn=2][k=0] ... f[nn=nns*2-1][k=0]
+	// f[nn=0][k=1] f[nn=1][k=1] f[nn=2][k=1] ... f[nn=nns*2-1][k=1]
+	// ...
+	//
+	// Likewise, the nns softmax biases are stored first, followed by the nns elliott biases.
+	float *neurons_q1;
+	float *bias_q1;
+	float *neurons_q2;
+	float *bias_q2;
+};
+
+InterleavedPredictorModel create_interleaved_predictor_model(const PredictorModel &model);
+
+
+// SSE
+std::unique_ptr<Prescreener> create_prescreener_old_sse(const PrescreenerOldCoefficients &coeffs, double pixel_half);
+std::unique_ptr<Prescreener> create_prescreener_new_sse(const PrescreenerNewCoefficients &coeffs, double pixel_half);
+
+// SSE2
+void byte_to_float_sse2(const void *src, void *dst, size_t n);
+void word_to_float_sse2(const void *src, void *dst, size_t n);
+void float_to_byte_sse2(const void *src, void *dst, size_t n);
+void float_to_word_sse2(const void *src, void *dst, size_t n);
+
+void cubic_interpolation_sse2(const float * const src[4], float *dst, const unsigned char *prescreen, unsigned n);
+
+std::unique_ptr<Predictor> create_predictor_sse2(const PredictorModel &model, bool use_q2);
+
+
+
+pixel_io_func select_pixel_io_func_arm(PixelType in, PixelType out, CPUClass cpu);
+interpolate_func select_interpolate_func_arm(CPUClass cpu);
+
+std::unique_ptr<Prescreener> create_prescreener_old_arm(const PrescreenerOldCoefficients &coeffs, double pixel_half, CPUClass cpu);
+std::unique_ptr<Prescreener> create_prescreener_new_arm(const PrescreenerNewCoefficients &coeffs, double pixel_half, CPUClass cpu);
+std::unique_ptr<Predictor> create_predictor_arm(const PredictorModel &model, bool use_q2, CPUClass cpu);
+
+} // namespace znedi3
+
+#endif // ARM_KERNEL_ARM_H_
+
+#endif // ZNEDI3_ARM