Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

meson build system and ARM neon optimizations #19

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
159 changes: 159 additions & 0 deletions meson.build
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
# Project declaration. Both C and C++ are enabled; the version given here is
# what meson.project_version() returns when PACKAGE_VERSION is defined further
# down. Builds default to an optimized release with link-time optimization.
project('znedi3', 'c', 'cpp',
version: '2.2',
default_options: ['c_std=c99', 'cpp_std=c++14', 'buildtype=release', 'b_lto=true'],
meson_version: '>=0.46')


# Extra compiler diagnostics applied to every target.
warnings = [
  '-Wall',
  '-Wextra',
  '-Wshadow',
]

# Flags shared by every target in this file.
#
# Header search paths are deliberately NOT listed here as raw '-I' flags:
# relative '-I' paths are resolved against the build directory at compile
# time, so they would not point at the source tree. Every target instead
# passes `include_directories: includes`, which Meson resolves correctly.
cflags = [
  warnings,
  '-fvisibility=hidden',
  '-DGRAPHENGINE_IMPL_NAMESPACE=znedi3',
  '-DPACKAGE_VERSION="@0@"'.format(meson.project_version()),
  '-DNDEBUG',
]

# Extra linker flags for the final targets (currently none).
ldflags = [
]

# Header search paths, passed to every target via `include_directories:`.
includes = include_directories(
'graphengine/include',
'znedi3',
'vsxx',
'vsxx/vapoursynth',
)

# Architecture-independent core sources; the per-architecture branches below
# append their own kernels to this list.
znedi3_sources = [
'znedi3/kernel.cpp',
'znedi3/weights.cpp',
'znedi3/znedi3.cpp',
'znedi3/znedi3_impl.cpp',
]

# Sources that are only part of the 'vsznedi3' shared module.
vsznedi3_sources = [
'vsznedi3/vsznedi3.cpp',
'vsxx/vsxx4_pluginmain.cpp',
]

# Sources of the bundled graphengine helper library.
graphengine_sources = [
'graphengine/graphengine/cpuinfo.cpp',
'graphengine/graphengine/graph.cpp',
'graphengine/graphengine/node.cpp',
]

# Sources that are only part of the 'testapp' executable.
testapp_sources = [
'testapp/argparse.cpp',
'testapp/main.cpp',
'testapp/mmap.cpp',
'testapp/win32_bitmap.cpp',
]

# Static helper libraries linked into both final targets; filled in below.
znedi3_helper_libs = []

# CPU family of the machine the binaries will run on; selects the
# architecture-specific branches below.
host_cpu_family = host_machine.cpu_family()

# x86 / x86_64: the common flags gain SSE2, and the baseline x86 kernels are
# added to the core source list. Each wider instruction set is built as its
# own helper static library so that the matching -m flags are applied only to
# that kernel's translation unit.
if host_cpu_family.startswith('x86')
  cflags += ['-mfpmath=sse', '-msse2', '-DZNEDI3_X86=1']

  znedi3_sources += [
    'znedi3/x86/cpuinfo_x86.cpp',
    'znedi3/x86/kernel_x86.cpp',
    'znedi3/x86/kernel_sse.cpp',
    'znedi3/x86/kernel_sse2.cpp',
  ]

  znedi3_sources_x86_avx = [
    'znedi3/x86/kernel_avx.cpp',
  ]

  znedi3_sources_x86_f16c = [
    'znedi3/x86/kernel_f16c.cpp',
  ]

  znedi3_sources_x86_avx2 = [
    'znedi3/x86/kernel_avx2.cpp',
  ]

  znedi3_sources_x86_avx512 = [
    'znedi3/x86/kernel_avx512.cpp',
  ]

  znedi3_helper_libs += static_library('avx',
    znedi3_sources_x86_avx,
    include_directories: includes,
    cpp_args: [cflags, '-mavx', '-mtune=sandybridge'],
    pic: true,
    install: false)

  znedi3_helper_libs += static_library('f16c',
    znedi3_sources_x86_f16c,
    include_directories: includes,
    cpp_args: [cflags, '-mavx', '-mf16c', '-mtune=ivybridge'],
    pic: true,
    install: false)

  znedi3_helper_libs += static_library('avx2',
    znedi3_sources_x86_avx2,
    include_directories: includes,
    cpp_args: [cflags, '-mavx2', '-mfma', '-mtune=haswell'],
    pic: true,
    install: false)

  # Must be built from the AVX-512 source list. Building it from the AVX2 list
  # would leave kernel_avx512.cpp uncompiled and compile kernel_avx2.cpp twice.
  znedi3_helper_libs += static_library('avx512',
    znedi3_sources_x86_avx512,
    include_directories: includes,
    cpp_args: [cflags, '-mavx512f', '-mfma', '-mtune=skylake-avx512'],
    pic: true,
    install: false)

  graphengine_sources += [
    'graphengine/graphengine/x86/cpuinfo_x86.cpp'
  ]

endif

# ARM / AArch64: define ZNEDI3_ARM (the macro that guards the contents of the
# znedi3/arm sources) and add the ARM kernels to the core source list.
# NOTE(review): no NEON-enabling flag (e.g. -mfpu=neon) is added here; 32-bit
# ARM toolchains may need one to build kernel_neon.cpp - confirm.
if host_cpu_family.startswith('aarch') or host_cpu_family.startswith('arm')
cflags += ['-DZNEDI3_ARM=1']

znedi3_sources += [
'znedi3/arm/kernel_arm.cpp',
'znedi3/arm/kernel_neon.cpp',
]
endif

# graphengine is built once as a helper library and linked into both final
# targets. It is declared after the architecture branches so that any sources
# they appended to graphengine_sources are included.
znedi3_helper_libs += static_library('graphengine',
graphengine_sources,
include_directories: includes,
cpp_args: [cflags],
pic: true,
install: false)

# NOTE(review): `cxx` is not referenced anywhere else in this file.
cxx = meson.get_compiler('cpp')


# Installed plugin module: core sources plus the vsznedi3 sources.
shared_module('vsznedi3',
znedi3_sources + vsznedi3_sources,
include_directories: includes,
link_args: ldflags,
c_args: cflags,
cpp_args: cflags,
link_with: znedi3_helper_libs,
install: true)

# Test application (not installed): core sources plus the testapp sources.
# The core sources are compiled separately for this target.
executable ('testapp',
znedi3_sources + testapp_sources,
include_directories: includes,
link_args: ldflags,
c_args: cflags,
cpp_args: cflags,
link_with: znedi3_helper_libs,
install: false)
99 changes: 99 additions & 0 deletions znedi3/arm/kernel_arm.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
#ifdef ZNEDI3_ARM

#include <algorithm>
#include <cassert>
#include "alloc.h"
#include "kernel.h"
#include "kernel_arm.h"
#include "znedi3_impl.h"

namespace znedi3 {
namespace {

// Maps an (input, output) pixel type pair to its conversion routine.
//
// Supported pairs are BYTE->FLOAT, WORD->FLOAT, FLOAT->BYTE and FLOAT->WORD;
// every other combination yields nullptr. The routines returned here keep an
// `_sse2` suffix in their names even though this is the ARM selector; they
// are declared in kernel_arm.h.
pixel_io_func select_pixel_io_func_neon(PixelType in, PixelType out)
{
	if (out == PixelType::FLOAT) {
		// Conversions into FLOAT.
		if (in == PixelType::BYTE)
			return byte_to_float_sse2;
		if (in == PixelType::WORD)
			return word_to_float_sse2;
		return nullptr;
	}

	if (in == PixelType::FLOAT) {
		// Conversions out of FLOAT.
		if (out == PixelType::BYTE)
			return float_to_byte_sse2;
		if (out == PixelType::WORD)
			return float_to_word_sse2;
	}

	return nullptr;
}


} // namespace

// Repacks a predictor model into the layout described by
// InterleavedPredictorModel.
//
// The work is done on the result of copy_model(model) after subtract_mean()
// has been applied to it. For each quality level (q1, q2) the output holds:
//   - neurons: filter_size rows of 2 * nns floats; row k contains coefficient
//     k of every softmax neuron followed by coefficient k of every elliott
//     neuron.
//   - bias: nns softmax biases followed by nns elliott biases.
// All four arrays are carved, in order, out of the single `data` buffer.
//
// Precondition (debug builds only): model.first.nns is a multiple of 16.
InterleavedPredictorModel create_interleaved_predictor_model(const PredictorModel &model)
{
	assert(model.first.nns % 16 == 0);

	const unsigned nns = model.first.nns;
	const unsigned filter_size = model.first.xdim * model.first.ydim;

	// Floats needed per quality level (softmax half + elliott half).
	const unsigned neuron_floats = nns * filter_size * 2;
	const unsigned bias_floats = nns * 2;

	PredictorModel centered = copy_model(model);
	subtract_mean(centered);

	InterleavedPredictorModel result{};
	result.data.resize(neuron_floats * 2 + bias_floats * 2);

	result.xdim = centered.first.xdim;
	result.ydim = centered.first.ydim;
	result.nns = centered.first.nns;

	// Allocation order defines the position of each array inside `data`.
	LinearAllocator alloc{ result.data.data() };
	result.neurons_q1 = alloc.allocate_n<float>(neuron_floats);
	result.bias_q1 = alloc.allocate_n<float>(bias_floats);
	result.neurons_q2 = alloc.allocate_n<float>(neuron_floats);
	result.bias_q2 = alloc.allocate_n<float>(bias_floats);
	assert(alloc.count() / sizeof(float) == result.data.size());

	// Transpose from neuron-major (source) to coefficient-major (destination).
	for (unsigned k = 0; k < filter_size; ++k) {
		float *row_q1 = result.neurons_q1 + k * nns * 2;
		float *row_q2 = result.neurons_q2 + k * nns * 2;

		for (unsigned nn = 0; nn < nns; ++nn) {
			const unsigned src = nn * filter_size + k;

			row_q1[nn] = centered.second.softmax_q1[src];
			row_q1[nn + nns] = centered.second.elliott_q1[src];
			row_q2[nn] = centered.second.softmax_q2[src];
			row_q2[nn + nns] = centered.second.elliott_q2[src];
		}
	}

	// Biases: softmax block first, elliott block second.
	std::copy_n(centered.second.softmax_bias_q1, nns, result.bias_q1);
	std::copy_n(centered.second.elliott_bias_q1, nns, result.bias_q1 + nns);
	std::copy_n(centered.second.softmax_bias_q2, nns, result.bias_q2);
	std::copy_n(centered.second.elliott_bias_q2, nns, result.bias_q2 + nns);

	return result;
}

// ARM entry point for choosing a pixel conversion routine.
//
// Always defers to the NEON selector; `cpu` is currently not consulted.
// Returns nullptr when no routine exists for the (in, out) pair.
pixel_io_func select_pixel_io_func_arm(PixelType in, PixelType out, CPUClass cpu)
{
	static_cast<void>(cpu); // Intentionally unused; silences -Wunused-parameter.
	return select_pixel_io_func_neon(in, out);
}

// ARM entry point for choosing the cubic interpolation routine.
//
// Always returns cubic_interpolation_sse2 (declared in kernel_arm.h); `cpu`
// is currently not consulted.
interpolate_func select_interpolate_func_arm(CPUClass cpu)
{
	static_cast<void>(cpu); // Intentionally unused; silences -Wunused-parameter.
	return cubic_interpolation_sse2;
}

// ARM entry point for creating the "old" prescreener.
//
// Forwards `coeffs` and `pixel_half` to create_prescreener_old_sse (declared
// in kernel_arm.h); `cpu` is currently not consulted.
std::unique_ptr<Prescreener> create_prescreener_old_arm(const PrescreenerOldCoefficients &coeffs, double pixel_half, CPUClass cpu)
{
	static_cast<void>(cpu); // Intentionally unused; silences -Wunused-parameter.
	return create_prescreener_old_sse(coeffs, pixel_half);
}

// ARM entry point for creating the "new" prescreener.
//
// Forwards `coeffs` and `pixel_half` to create_prescreener_new_sse (declared
// in kernel_arm.h); `cpu` is currently not consulted.
std::unique_ptr<Prescreener> create_prescreener_new_arm(const PrescreenerNewCoefficients &coeffs, double pixel_half, CPUClass cpu)
{
	static_cast<void>(cpu); // Intentionally unused; silences -Wunused-parameter.
	return create_prescreener_new_sse(coeffs, pixel_half);
}

// ARM entry point for creating the predictor.
//
// Forwards `model` and `use_q2` to create_predictor_sse2 (declared in
// kernel_arm.h); `cpu` is currently not consulted.
std::unique_ptr<Predictor> create_predictor_arm(const PredictorModel &model, bool use_q2, CPUClass cpu)
{
	static_cast<void>(cpu); // Intentionally unused; silences -Wunused-parameter.
	return create_predictor_sse2(model, use_q2);
}

} // namespace znedi3

#endif // ZNEDI3_ARM
88 changes: 88 additions & 0 deletions znedi3/arm/kernel_arm.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#pragma once

#ifdef ZNEDI3_ARM

#ifndef ARM_KERNEL_ARM_H_
#define ARM_KERNEL_ARM_H_

#include <memory>
#include <utility>
#include "alloc.h"
#include "kernel.h"
#include "weights.h"

namespace znedi3 {

enum class CPUClass;
enum class PixelType;

// Polynomial coefficients for exp2f(x - 1) on the domain [1.0, 2.0].
// Coefficients are stored low-order to high-order (degree 4, five terms).
// Sanity check: the coefficients sum to approximately 1.0, i.e. the
// polynomial evaluates to about exp2f(0) at x = 1.0.
constexpr float EXP2F_X_PLUS1_REMEZ[] = {
0.509871020343597804469416f,
0.312146713032169896138863f,
0.166617139319965966118107f,
-2.19061993049215080032874e-3f,
1.3555747234758484073940937e-2f
};

// Coefficients such that converting (EXPF_LN2_INV_SCALED * x + EXPF_ONE_SCALED)
// to integer and reinterpreting the result as a float produces the integer
// component of (x / ln(2)) in the exponent and the fractional component in the
// mantissa.
//
// EXPF_ONE_SCALED is 127 * 2^23 (0x3F800000), so EXPF_LN2_INV_SCALED works out
// to 2^23 / ln(2).
constexpr float EXPF_LN2_INV_SCALED = 12102203.1615614f; // (1.0 / (127 * ln(2))) * EXPF_ONE_SCALED.
constexpr float EXPF_ONE_SCALED = 1065353216.f; // Integer representation of 1.0f.


// Predictor weights repacked for the kernels. Instances are built by
// create_interleaved_predictor_model().
struct InterleavedPredictorModel {
// Backing storage; the four pointers below point into this buffer.
AlignedVector<float> data;
// Filter window dimensions; each neuron has xdim * ydim coefficients.
unsigned xdim;
unsigned ydim;
// Number of neurons of each kind (softmax and elliott).
unsigned nns;

// Filter coefficients are stored interleaved by coefficient index: row k
// holds coefficient k of every neuron, with the nns softmax neurons
// (nn = 0 .. nns-1) first and the nns elliott neurons (nn = nns .. nns*2-1)
// after them.
//
// f[nn=0][k=0] f[nn=1][k=0] f[nn=2][k=0] ... f[nn=nns*2-1][k=0]
// f[nn=0][k=1] f[nn=1][k=1] f[nn=2][k=1] ... f[nn=nns*2-1][k=1]
// ...
//
// Likewise, each bias array holds the nns softmax biases followed by the
// nns elliott biases.
float *neurons_q1;
float *bias_q1;
float *neurons_q2;
float *bias_q2;
};

// Builds the interleaved representation of `model`. The returned object owns
// its storage through its `data` member.
InterleavedPredictorModel create_interleaved_predictor_model(const PredictorModel &model);


// Kernel entry points used by the ARM dispatch functions declared below.
// NOTE(review): these keep `_sse` / `_sse2` suffixes although this header is
// only active when ZNEDI3_ARM is defined; presumably the NEON sources provide
// the definitions under these names - confirm.

// Prescreener factories (suffix: _sse).
std::unique_ptr<Prescreener> create_prescreener_old_sse(const PrescreenerOldCoefficients &coeffs, double pixel_half);
std::unique_ptr<Prescreener> create_prescreener_new_sse(const PrescreenerNewCoefficients &coeffs, double pixel_half);

// Pixel format conversions (suffix: _sse2). `n` is presumably an element
// count - confirm against the definitions.
void byte_to_float_sse2(const void *src, void *dst, size_t n);
void word_to_float_sse2(const void *src, void *dst, size_t n);
void float_to_byte_sse2(const void *src, void *dst, size_t n);
void float_to_word_sse2(const void *src, void *dst, size_t n);

// Cubic interpolation over four source rows into `dst`.
void cubic_interpolation_sse2(const float * const src[4], float *dst, const unsigned char *prescreen, unsigned n);

// Predictor factory.
std::unique_ptr<Predictor> create_predictor_sse2(const PredictorModel &model, bool use_q2);



// ARM dispatch functions. Each takes a CPUClass in addition to the arguments
// of the kernel entry point it selects or forwards to.

// Returns the conversion routine for the (in, out) pair, or nullptr if none exists.
pixel_io_func select_pixel_io_func_arm(PixelType in, PixelType out, CPUClass cpu);
// Returns the interpolation routine to use.
interpolate_func select_interpolate_func_arm(CPUClass cpu);

// Factories for the prescreener and predictor implementations.
std::unique_ptr<Prescreener> create_prescreener_old_arm(const PrescreenerOldCoefficients &coeffs, double pixel_half, CPUClass cpu);
std::unique_ptr<Prescreener> create_prescreener_new_arm(const PrescreenerNewCoefficients &coeffs, double pixel_half, CPUClass cpu);
std::unique_ptr<Predictor> create_predictor_arm(const PredictorModel &model, bool use_q2, CPUClass cpu);

} // namespace znedi3

#endif // ARM_KERNEL_ARM_H_

#endif // ZNEDI3_ARM
Loading