From 6edea3f6bd631ebbce7e429c684e992dbf60f6e5 Mon Sep 17 00:00:00 2001 From: Niklas P Andersson <3985238+niklaspandersson@users.noreply.github.com> Date: Mon, 3 Jun 2024 14:28:34 +0200 Subject: [PATCH 1/6] Use v210 as 10bit pixel format on decklink --- .../decklink/consumer/decklink_consumer.cpp | 100 +++++++++++++++--- src/modules/decklink/consumer/frame.cpp | 36 +++++-- src/modules/decklink/consumer/frame.h | 4 +- src/modules/decklink/decklink_api.h | 19 ++++ 4 files changed, 129 insertions(+), 30 deletions(-) diff --git a/src/modules/decklink/consumer/decklink_consumer.cpp b/src/modules/decklink/consumer/decklink_consumer.cpp index 2bd1aa4c91..846d3c50ec 100644 --- a/src/modules/decklink/consumer/decklink_consumer.cpp +++ b/src/modules/decklink/consumer/decklink_consumer.cpp @@ -222,6 +222,7 @@ class decklink_frame , public IDeckLinkVideoFrameMetadataExtensions { core::video_format_desc format_desc_; + BMDPixelFormat pix_fmt_; std::shared_ptr data_; std::atomic ref_count_{0}; int nb_samples_; @@ -229,18 +230,39 @@ class decklink_frame core::color_space color_space_; hdr_meta_configuration hdr_metadata_; BMDFrameFlags flags_; - BMDPixelFormat pix_fmt_; public: - decklink_frame(std::shared_ptr data, core::video_format_desc format_desc, int nb_samples, bool hdr, core::color_space color_space, const hdr_meta_configuration& hdr_metadata) + decklink_frame(core::video_format_desc format_desc, + int nb_samples, + bool hdr, + core::color_space color_space, + const hdr_meta_configuration& hdr_metadata, + BMDPixelFormat pix_fmt, + std::shared_ptr data) : format_desc_(std::move(format_desc)) + , pix_fmt_(pix_fmt) , data_(std::move(data)) , nb_samples_(nb_samples) , hdr_(hdr) , color_space_(color_space) , hdr_metadata_(hdr_metadata) , flags_(hdr ? bmdFrameFlagDefault | bmdFrameContainsHDRMetadata : bmdFrameFlagDefault) + { + } + + decklink_frame(core::video_format_desc format_desc, + int nb_samples, + bool hdr, + core::color_space color_space, + const hdr_meta_configuration& hdr_metadata) + : format_desc_(std::move(format_desc)) , pix_fmt_(get_pixel_format(hdr)) + , data_(allocate_frame_data(format_desc, pix_fmt_)) + , nb_samples_(nb_samples) + , hdr_(hdr) + , color_space_(color_space) + , hdr_metadata_(hdr_metadata) + , flags_(hdr ? bmdFrameFlagDefault | bmdFrameContainsHDRMetadata : bmdFrameFlagDefault) { } @@ -255,7 +277,7 @@ class decklink_frame #else REFIID iunknown = IID_IUnknown; #endif - HRESULT result = E_NOINTERFACE; + HRESULT result = E_NOINTERFACE; if (ppv == nullptr) return E_INVALIDARG; @@ -292,9 +314,12 @@ class decklink_frame // IDecklinkVideoFrame - long STDMETHODCALLTYPE GetWidth() override { return static_cast(format_desc_.width); } - long STDMETHODCALLTYPE GetHeight() override { return static_cast(format_desc_.height); } - long STDMETHODCALLTYPE GetRowBytes() override { return static_cast(get_row_bytes(format_desc_, hdr_)); } + long STDMETHODCALLTYPE GetWidth() override { return static_cast(format_desc_.width); } + long STDMETHODCALLTYPE GetHeight() override { return static_cast(format_desc_.height); } + long STDMETHODCALLTYPE GetRowBytes() override + { + return static_cast(get_row_bytes(pix_fmt_, format_desc_.width)); + } BMDPixelFormat STDMETHODCALLTYPE GetPixelFormat() override { return pix_fmt_; } BMDFrameFlags STDMETHODCALLTYPE GetFlags() override { return flags_; } @@ -338,7 +363,7 @@ class decklink_frame HRESULT STDMETHODCALLTYPE GetFloat(BMDDeckLinkFrameMetadataID metadataID, double* value) { const auto color_space = (color_space_ == core::color_space::bt2020) ? &REC_2020 : &REC_709; - HRESULT result = S_OK; + HRESULT result = S_OK; switch (metadataID) { case bmdDeckLinkFrameMetadataHDRDisplayPrimariesRedX: @@ -438,7 +463,7 @@ struct decklink_secondary_port final : public IDeckLinkVideoOutputCallback const core::video_format_desc decklink_format_desc_; com_ptr mode_ = get_display_mode(output_, decklink_format_desc_.format, - get_pixel_format(config_.hdr), + config_.hdr ? bmdFormat10BitRGBXLE : bmdFormat8BitBGRA, bmdSupportedVideoModeDefault, config_.hdr); @@ -555,7 +580,13 @@ struct decklink_secondary_port final : public IDeckLinkVideoOutputCallback void schedule_next_video(std::shared_ptr image_data, int nb_samples, BMDTimeValue display_time) { auto packed_frame = wrap_raw( - new decklink_frame(std::move(image_data), decklink_format_desc_, nb_samples, config_.hdr, core::color_space::bt709, config_.hdr_meta)); + new decklink_frame(decklink_format_desc_, + nb_samples, + config_.hdr, + core::color_space::bt709, + config_.hdr_meta, + config_.hdr ? bmdFormat10BitRGBXLE : bmdFormat8BitBGRA, + std::move(image_data))); if (FAILED(output_->ScheduleVideoFrame(get_raw(packed_frame), display_time, decklink_format_desc_.duration, @@ -624,6 +655,8 @@ struct decklink_consumer final : public IDeckLinkVideoOutputCallback bmdSupportedVideoModeDefault, config_.hdr); + com_ptr video_conversion_; + std::atomic abort_request_{false}; public: @@ -640,6 +673,10 @@ struct decklink_consumer final : public IDeckLinkVideoOutputCallback graph_->set_color("buffered-audio", diagnostics::color(0.9f, 0.9f, 0.5f)); graph_->set_color("buffered-video", diagnostics::color(0.2f, 0.9f, 0.9f)); + if (config_.hdr) { + video_conversion_ = create_video_converter(); + } + if (config.duplex != configuration::duplex_t::default_duplex) { set_duplex(iface_cast(decklink_), iface_cast(decklink_), @@ -708,11 +745,12 @@ struct decklink_consumer final : public IDeckLinkVideoOutputCallback nb_samples); } - std::shared_ptr image_data = allocate_frame_data(decklink_format_desc_, config_.hdr); + std::shared_ptr rgb_image_data = + allocate_frame_data(decklink_format_desc_, config_.hdr ? bmdFormat10BitRGBXLE : bmdFormat8BitBGRA); - schedule_next_video(image_data, nb_samples, video_scheduled_, config_.hdr_meta.default_color_space); + schedule_next_video(rgb_image_data, nb_samples, video_scheduled_, config_.hdr_meta.default_color_space); for (auto& context : secondary_port_contexts_) { - context->schedule_next_video(image_data, 0, video_scheduled_); + context->schedule_next_video(rgb_image_data, 0, video_scheduled_); } video_scheduled_ += decklink_format_desc_.duration; @@ -932,7 +970,8 @@ struct decklink_consumer final : public IDeckLinkVideoOutputCallback mode_->GetFieldDominance(), config_.hdr); - schedule_next_video(image_data, nb_samples, video_display_time, frame1.pixel_format_desc().color_space); + schedule_next_video( + image_data, nb_samples, video_display_time, frame1.pixel_format_desc().color_space); if (config_.embedded_audio) { schedule_next_audio(std::move(audio_data), nb_samples); @@ -992,12 +1031,39 @@ struct decklink_consumer final : public IDeckLinkVideoOutputCallback audio_scheduled_ += nb_samples; // TODO - what if there are too many/few samples in this frame? } - void schedule_next_video(std::shared_ptr image_data, int nb_samples, BMDTimeValue display_time, core::color_space color_space) + void schedule_next_video(std::shared_ptr image_data, + int nb_samples, + BMDTimeValue display_time, + core::color_space color_space) { - auto fill_frame = wrap_raw( - new decklink_frame(std::move(image_data), decklink_format_desc_, nb_samples, config_.hdr, color_space, config_.hdr_meta)); + auto rgb_frame = wrap_raw( + new decklink_frame(decklink_format_desc_, + nb_samples, + config_.hdr, + color_space, + config_.hdr_meta, + config_.hdr ? bmdFormat10BitRGBXLE : bmdFormat8BitBGRA, + std::move(image_data))); + + if (config_.hdr) { + auto yuv_frame = wrap_raw( + new decklink_frame(decklink_format_desc_, nb_samples, config_.hdr, color_space, config_.hdr_meta)); + + if (FAILED(video_conversion_->ConvertFrame(get_raw(rgb_frame), get_raw(yuv_frame)))) { + CASPAR_LOG(warning) << print() << L" Failed to convert video frame."; + } + + if (FAILED(output_->ScheduleVideoFrame(get_raw(yuv_frame), + display_time, + decklink_format_desc_.duration, + decklink_format_desc_.time_scale))) { + CASPAR_LOG(error) << print() << L" Failed to schedule primary video."; + } + return; + } + if (FAILED(output_->ScheduleVideoFrame( - get_raw(fill_frame), display_time, decklink_format_desc_.duration, decklink_format_desc_.time_scale))) { + get_raw(rgb_frame), display_time, decklink_format_desc_.duration, decklink_format_desc_.time_scale))) { CASPAR_LOG(error) << print() << L" Failed to schedule primary video."; } } diff --git a/src/modules/decklink/consumer/frame.cpp b/src/modules/decklink/consumer/frame.cpp index bb05ea2835..23c64b7f13 100644 --- a/src/modules/decklink/consumer/frame.cpp +++ b/src/modules/decklink/consumer/frame.cpp @@ -30,16 +30,26 @@ namespace caspar { namespace decklink { -BMDPixelFormat get_pixel_format(bool hdr) { return hdr ? bmdFormat10BitRGBXLE : bmdFormat8BitBGRA; } -int get_row_bytes(const core::video_format_desc& format_desc, bool hdr) +BMDPixelFormat get_pixel_format(bool hdr) { return hdr ? bmdFormat10BitYUV : bmdFormat8BitBGRA; } + +int get_row_bytes(BMDPixelFormat pix_fmt, int width) { - return hdr ? ((format_desc.width + 63) / 64) * 256 : format_desc.width * 4; + switch (pix_fmt) { + case bmdFormat10BitYUV: + return ((width + 47) / 48) * 128; + case bmdFormat10BitRGBXLE: + return ((width + 63) / 64) * 256; + default: + break; + } + + return width * 4; } -std::shared_ptr allocate_frame_data(const core::video_format_desc& format_desc, bool hdr) +std::shared_ptr allocate_frame_data(const core::video_format_desc& format_desc, BMDPixelFormat pix_fmt) { - auto alignment = hdr ? 256 : 64; - auto size = hdr ? get_row_bytes(format_desc, hdr) * format_desc.height : format_desc.size; + auto alignment = 256; + auto size = get_row_bytes(pix_fmt, format_desc.width) * format_desc.height; return create_aligned_buffer(size, alignment); } @@ -74,7 +84,7 @@ void convert_frame(const core::video_format_desc& channel_format_desc, // Pack eight byte R16G16B16A16 pixels as four byte 10bit RGB R10G10B10XX const int NUM_THREADS = 4; auto rows_per_thread = decklink_format_desc.height / NUM_THREADS; - size_t byte_count_line = get_row_bytes(decklink_format_desc, hdr); + size_t byte_count_line = get_row_bytes(bmdFormat10BitRGBXLE, decklink_format_desc.width); tbb::parallel_for(0, NUM_THREADS, [&](int i) { auto end = (i + 1) * rows_per_thread; for (int y = firstLine + i * rows_per_thread; y < end; y += decklink_format_desc.field_count) { @@ -82,9 +92,12 @@ void convert_frame(const core::video_format_desc& channel_format_desc, for (int x = 0; x < decklink_format_desc.width; x += 1) { auto src = reinterpret_cast( frame.image_data(0).data() + (long long)y * decklink_format_desc.width * 8 + x * 8); - uint16_t blue = src[0] >> 6; - uint16_t green = src[1] >> 6; - uint16_t red = src[2] >> 6; + + // Scale down to 10 bit and convert to video range to get a valid + // v210 value after the decklink conversion + uint32_t blue = (src[0] >> 6) * 876 / 1024 + 64; + uint32_t green = (src[1] >> 6) * 876 / 1024 + 64; + uint32_t red = (src[2] >> 6) * 876 / 1024 + 64; dest[x] = ((uint32_t)(red) << 22) + ((uint32_t)(green) << 12) + ((uint32_t)(blue) << 2); } } @@ -175,7 +188,8 @@ std::shared_ptr convert_frame_for_port(const core::video_format_desc& chan BMDFieldDominance field_dominance, bool hdr) { - std::shared_ptr image_data = allocate_frame_data(decklink_format_desc, hdr); + std::shared_ptr image_data = + allocate_frame_data(decklink_format_desc, hdr ? bmdFormat10BitRGBXLE : bmdFormat8BitBGRA); if (field_dominance != bmdProgressiveFrame) { convert_frame(channel_format_desc, diff --git a/src/modules/decklink/consumer/frame.h b/src/modules/decklink/consumer/frame.h index e96109c4ca..41bd3f3fa7 100644 --- a/src/modules/decklink/consumer/frame.h +++ b/src/modules/decklink/consumer/frame.h @@ -35,9 +35,9 @@ namespace caspar { namespace decklink { BMDPixelFormat get_pixel_format(bool hdr); -int get_row_bytes(const core::video_format_desc& format_desc, bool hdr); +int get_row_bytes(BMDPixelFormat pix_fmt, int width); -std::shared_ptr allocate_frame_data(const core::video_format_desc& format_desc, bool hdr); +std::shared_ptr allocate_frame_data(const core::video_format_desc& format_desc, BMDPixelFormat pix_fmt); std::shared_ptr convert_frame_for_port(const core::video_format_desc& channel_format_desc, const core::video_format_desc& decklink_format_desc, diff --git a/src/modules/decklink/decklink_api.h b/src/modules/decklink/decklink_api.h index 1749976c61..1c7dc0146b 100644 --- a/src/modules/decklink/decklink_api.h +++ b/src/modules/decklink/decklink_api.h @@ -82,6 +82,15 @@ static com_ptr create_iterator() return pDecklinkIterator; } +static com_ptr create_video_converter() +{ + CComPtr pVideoConversion_; + if (FAILED(pVideoConversion_.CoCreateInstance(CLSID_CDeckLinkVideoConversion))) + CASPAR_THROW_EXCEPTION(not_supported() << msg_info("Could not create video converter.")); + + return pVideoConversion_; +} + template static com_iface_ptr iface_cast(const com_ptr& ptr, bool optional = false) { @@ -164,6 +173,16 @@ static com_ptr create_iterator() return wrap_raw(iterator, true); } +static com_ptr create_video_converter() +{ + IDeckLinkVideoConversion* converter = CreateVideoConversionInstance(); + + if (converter == nullptr) + CASPAR_THROW_EXCEPTION(not_supported() << msg_info("Could not create video converter.")); + + return wrap_raw(converter, true); +} + template static REFIID iface_id() { From 02299239571ad25b9397a1a1cc6cc319907cdc79 Mon Sep 17 00:00:00 2001 From: Niklas P Andersson <3985238+niklaspandersson@users.noreply.github.com> Date: Fri, 14 Jun 2024 13:32:59 +0200 Subject: [PATCH 2/6] simd optimize 16 to 10 bit conversion --- src/modules/decklink/consumer/frame.cpp | 38 +++++++++++++++++++------ 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/src/modules/decklink/consumer/frame.cpp b/src/modules/decklink/consumer/frame.cpp index 23c64b7f13..e96144e590 100644 --- a/src/modules/decklink/consumer/frame.cpp +++ b/src/modules/decklink/consumer/frame.cpp @@ -46,6 +46,25 @@ int get_row_bytes(BMDPixelFormat pix_fmt, int width) return width * 4; } +inline unsigned int pack_pixel(__m128i pixel) { + // Scale down to 10 bit and convert to video range to get a valid + // v210 value after the decklink conversion + // formula: scaled_channel = (src >> 6) * 876 / 1024 + 64; + + __m128i bit32 = _mm_unpacklo_epi16(pixel, _mm_setzero_si128()); // unpack 16 bit components to 32 bit + __m128i bit10 = _mm_srli_epi32(bit32, 6); // shift down to 10 bit precision + bit10 = _mm_mullo_epi32(bit10, _mm_set1_epi32(876)); // multiply by 876 + bit10 = _mm_srli_epi32(bit10, 10); // divide by 1024 + bit10 = _mm_add_epi32(bit10, _mm_set1_epi32(64)); // add 64 + + // Extract the 10 bit components and save to dest + uint32_t blue = _mm_extract_epi32(bit10, 0); + uint32_t green = _mm_extract_epi32(bit10, 1); + uint32_t red = _mm_extract_epi32(bit10, 2); + + return (red << 22) + (green << 12) + (blue << 2); +} + std::shared_ptr allocate_frame_data(const core::video_format_desc& format_desc, BMDPixelFormat pix_fmt) { auto alignment = 256; @@ -82,23 +101,26 @@ void convert_frame(const core::video_format_desc& channel_format_desc, if (hdr) { // Pack eight byte R16G16B16A16 pixels as four byte 10bit RGB R10G10B10XX - const int NUM_THREADS = 4; + const int NUM_THREADS = 8; auto rows_per_thread = decklink_format_desc.height / NUM_THREADS; size_t byte_count_line = get_row_bytes(bmdFormat10BitRGBXLE, decklink_format_desc.width); tbb::parallel_for(0, NUM_THREADS, [&](int i) { auto end = (i + 1) * rows_per_thread; for (int y = firstLine + i * rows_per_thread; y < end; y += decklink_format_desc.field_count) { auto dest = reinterpret_cast(image_data.get()) + (long long)y * byte_count_line / 4; - for (int x = 0; x < decklink_format_desc.width; x += 1) { + __m128i zero = _mm_setzero_si128(); + __m128i fac = _mm_set1_epi32(876); + __m128i offset = _mm_set1_epi32(64); + + for (int x = 0; x < decklink_format_desc.width; x += 2) { auto src = reinterpret_cast( frame.image_data(0).data() + (long long)y * decklink_format_desc.width * 8 + x * 8); - // Scale down to 10 bit and convert to video range to get a valid - // v210 value after the decklink conversion - uint32_t blue = (src[0] >> 6) * 876 / 1024 + 64; - uint32_t green = (src[1] >> 6) * 876 / 1024 + 64; - uint32_t red = (src[2] >> 6) * 876 / 1024 + 64; - dest[x] = ((uint32_t)(red) << 22) + ((uint32_t)(green) << 12) + ((uint32_t)(blue) << 2); + // SIMD optimized + // Load two pixels at once to stay on 16-byte aligned memory + __m128i pixels = _mm_load_si128(reinterpret_cast(src)); + dest[x] = pack_pixel(_mm_unpacklo_epi64(pixels, zero)); + dest[x + 1] = pack_pixel(_mm_unpackhi_epi64(pixels, zero)); } } }); From a845152fca4a6dc6e99c7a319a3fc6e0885c13bf Mon Sep 17 00:00:00 2001 From: Niklas Andersson <3985238+niklaspandersson@users.noreply.github.com> Date: Tue, 18 Jun 2024 11:42:04 +0200 Subject: [PATCH 3/6] avx2 simd for 16 to 10 bit conversion --- src/CMakeModules/Bootstrap_Linux.cmake | 2 + src/CMakeModules/Bootstrap_Windows.cmake | 2 +- src/modules/decklink/consumer/frame.cpp | 71 ++++++++++++++---------- 3 files changed, 46 insertions(+), 29 deletions(-) diff --git a/src/CMakeModules/Bootstrap_Linux.cmake b/src/CMakeModules/Bootstrap_Linux.cmake index 483522e9e8..a87b9e05e6 100644 --- a/src/CMakeModules/Bootstrap_Linux.cmake +++ b/src/CMakeModules/Bootstrap_Linux.cmake @@ -121,7 +121,9 @@ endif() IF (CMAKE_SYSTEM_PROCESSOR MATCHES "(i[3-6]86|x64|x86_64|amd64|e2k)") ADD_COMPILE_OPTIONS (-msse3) ADD_COMPILE_OPTIONS (-mssse3) + ADD_COMPILE_OPTIONS (-mavx) ADD_COMPILE_OPTIONS (-msse4.1) + ADD_COMPILE_OPTIONS (-mavx2) ELSE () ADD_COMPILE_DEFINITIONS (USE_SIMDE) ENDIF () diff --git a/src/CMakeModules/Bootstrap_Windows.cmake b/src/CMakeModules/Bootstrap_Windows.cmake index e47d0d5a3f..eb42610c55 100644 --- a/src/CMakeModules/Bootstrap_Windows.cmake +++ b/src/CMakeModules/Bootstrap_Windows.cmake @@ -264,7 +264,7 @@ add_definitions(-D_WIN32_WINNT=0x601) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHa /Zi /W4 /WX /MP /fp:fast /Zm192 /FIcommon/compiler/vs/disable_silly_warnings.h") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /D TBB_USE_ASSERT=1 /D TBB_USE_DEBUG /bigobj") -set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Oi /Ot /Gy /bigobj") +set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Oi /arch:AVX2 /Ot /Gy /bigobj") if (POLICY CMP0045) cmake_policy(SET CMP0045 OLD) diff --git a/src/modules/decklink/consumer/frame.cpp b/src/modules/decklink/consumer/frame.cpp index e96144e590..30f69d2fbe 100644 --- a/src/modules/decklink/consumer/frame.cpp +++ b/src/modules/decklink/consumer/frame.cpp @@ -46,25 +46,6 @@ int get_row_bytes(BMDPixelFormat pix_fmt, int width) return width * 4; } -inline unsigned int pack_pixel(__m128i pixel) { - // Scale down to 10 bit and convert to video range to get a valid - // v210 value after the decklink conversion - // formula: scaled_channel = (src >> 6) * 876 / 1024 + 64; - - __m128i bit32 = _mm_unpacklo_epi16(pixel, _mm_setzero_si128()); // unpack 16 bit components to 32 bit - __m128i bit10 = _mm_srli_epi32(bit32, 6); // shift down to 10 bit precision - bit10 = _mm_mullo_epi32(bit10, _mm_set1_epi32(876)); // multiply by 876 - bit10 = _mm_srli_epi32(bit10, 10); // divide by 1024 - bit10 = _mm_add_epi32(bit10, _mm_set1_epi32(64)); // add 64 - - // Extract the 10 bit components and save to dest - uint32_t blue = _mm_extract_epi32(bit10, 0); - uint32_t green = _mm_extract_epi32(bit10, 1); - uint32_t red = _mm_extract_epi32(bit10, 2); - - return (red << 22) + (green << 12) + (blue << 2); -} - std::shared_ptr allocate_frame_data(const core::video_format_desc& format_desc, BMDPixelFormat pix_fmt) { auto alignment = 256; @@ -105,22 +86,56 @@ void convert_frame(const core::video_format_desc& channel_format_desc, auto rows_per_thread = decklink_format_desc.height / NUM_THREADS; size_t byte_count_line = get_row_bytes(bmdFormat10BitRGBXLE, decklink_format_desc.width); tbb::parallel_for(0, NUM_THREADS, [&](int i) { - auto end = (i + 1) * rows_per_thread; + auto end = (i + 1) * rows_per_thread; + __m256i zero = _mm256_setzero_si256(); + __m256i fac = _mm256_set1_epi32(876); + __m256i offset = _mm256_set1_epi32(64); for (int y = firstLine + i * rows_per_thread; y < end; y += decklink_format_desc.field_count) { auto dest = reinterpret_cast(image_data.get()) + (long long)y * byte_count_line / 4; - __m128i zero = _mm_setzero_si128(); - __m128i fac = _mm_set1_epi32(876); - __m128i offset = _mm_set1_epi32(64); - for (int x = 0; x < decklink_format_desc.width; x += 2) { + for (int x = 0; x < decklink_format_desc.width; x += 4) { auto src = reinterpret_cast( frame.image_data(0).data() + (long long)y * decklink_format_desc.width * 8 + x * 8); // SIMD optimized - // Load two pixels at once to stay on 16-byte aligned memory - __m128i pixels = _mm_load_si128(reinterpret_cast(src)); - dest[x] = pack_pixel(_mm_unpacklo_epi64(pixels, zero)); - dest[x + 1] = pack_pixel(_mm_unpackhi_epi64(pixels, zero)); + // Load four pixels at once (16x4 = 64, 64 x 4 = 256 bytes) + __m256i pixels = _mm256_load_si256(reinterpret_cast(src)); + + __m256i pixel13 = _mm256_unpacklo_epi16(pixels, zero); + __m256i pixel24 = _mm256_unpackhi_epi16(pixels, zero); + + pixel13 = _mm256_srli_epi32(pixel13, 6); // shift down to 10 bit precision + pixel24 = _mm256_srli_epi32(pixel24, 6); // shift down to 10 bit precision + + pixel13 = _mm256_mullo_epi32(pixel13, fac); // multiply by 876 + pixel24 = _mm256_mullo_epi32(pixel24, fac); // multiply by 876 + + pixel13 = _mm256_srli_epi32(pixel13, 10); // divide by 1024 + pixel24 = _mm256_srli_epi32(pixel24, 10); // divide by 1024 + + pixel13 = _mm256_add_epi32(pixel13, offset); // add 64 + pixel24 = _mm256_add_epi32(pixel24, offset); // add 64 + + // extract the R, G and B components + __m256i blue_green = _mm256_unpacklo_epi32(pixel13, pixel24); + __m256i red_alpha = _mm256_unpackhi_epi32(pixel13, pixel24); + __m128i bg_low = _mm256_extracti128_si256(blue_green, 0); + __m128i bg_high = _mm256_extracti128_si256(blue_green, 1); + __m128i blue = _mm_unpacklo_epi64(bg_low, bg_high); + __m128i green = _mm_unpackhi_epi64(bg_low, bg_high); + __m128i red = _mm_unpacklo_epi64(_mm256_extracti128_si256(red_alpha, 0), + _mm256_extracti128_si256(red_alpha, 1)); + + // shift each component to their correct position in R10G10B10XX + red = _mm_slli_epi32(red, 22); + green = _mm_slli_epi32(green, 12); + blue = _mm_slli_epi32(blue, 2); + + // combine the components + __m128i result = _mm_add_epi32(_mm_add_epi32(red, green), blue); + + // store all four pixels at once + _mm_store_si128(reinterpret_cast<__m128i*>(&dest[x]), result); } } }); From 16cd34bc05744886d5ff538df0a285b84ca0ab06 Mon Sep 17 00:00:00 2001 From: Niklas Andersson <3985238+niklaspandersson@users.noreply.github.com> Date: Wed, 14 Aug 2024 13:28:12 +0200 Subject: [PATCH 4/6] Enfoce alignment of empty const_frames --- src/accelerator/ogl/image/image_mixer.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/accelerator/ogl/image/image_mixer.cpp b/src/accelerator/ogl/image/image_mixer.cpp index f99777eacb..6f7e92f788 100644 --- a/src/accelerator/ogl/image/image_mixer.cpp +++ b/src/accelerator/ogl/image/image_mixer.cpp @@ -26,6 +26,8 @@ #include "../util/device.h" #include "../util/texture.h" +#include + #include #include #include @@ -91,7 +93,7 @@ class image_renderer const core::video_format_desc& format_desc) { if (layers.empty()) { // Bypass GPU with empty frame. - static const std::vector buffer(max_frame_size_, 0); + static const std::vector> buffer(max_frame_size_, 0); return make_ready_future(array(buffer.data(), format_desc.size, true)); } From d6c55dd5626f88d45f038a5d83b255ffac6c2aa5 Mon Sep 17 00:00:00 2001 From: Niklas Andersson <3985238+niklaspandersson@users.noreply.github.com> Date: Wed, 21 Aug 2024 09:15:20 +0200 Subject: [PATCH 5/6] WIP: custom v210 encoding --- .../decklink/consumer/decklink_consumer.cpp | 41 +--- src/modules/decklink/consumer/frame.cpp | 203 +++++++++++++----- 2 files changed, 164 insertions(+), 80 deletions(-) diff --git a/src/modules/decklink/consumer/decklink_consumer.cpp b/src/modules/decklink/consumer/decklink_consumer.cpp index 846d3c50ec..a81feee486 100644 --- a/src/modules/decklink/consumer/decklink_consumer.cpp +++ b/src/modules/decklink/consumer/decklink_consumer.cpp @@ -463,7 +463,7 @@ struct decklink_secondary_port final : public IDeckLinkVideoOutputCallback const core::video_format_desc decklink_format_desc_; com_ptr mode_ = get_display_mode(output_, decklink_format_desc_.format, - config_.hdr ? bmdFormat10BitRGBXLE : bmdFormat8BitBGRA, + config_.hdr ? bmdFormat10BitYUV : bmdFormat8BitBGRA, bmdSupportedVideoModeDefault, config_.hdr); @@ -585,7 +585,7 @@ struct decklink_secondary_port final : public IDeckLinkVideoOutputCallback config_.hdr, core::color_space::bt709, config_.hdr_meta, - config_.hdr ? bmdFormat10BitRGBXLE : bmdFormat8BitBGRA, + config_.hdr ? bmdFormat10BitYUV : bmdFormat8BitBGRA, std::move(image_data))); if (FAILED(output_->ScheduleVideoFrame(get_raw(packed_frame), display_time, @@ -655,8 +655,6 @@ struct decklink_consumer final : public IDeckLinkVideoOutputCallback bmdSupportedVideoModeDefault, config_.hdr); - com_ptr video_conversion_; - std::atomic abort_request_{false}; public: @@ -673,10 +671,6 @@ struct decklink_consumer final : public IDeckLinkVideoOutputCallback graph_->set_color("buffered-audio", diagnostics::color(0.9f, 0.9f, 0.5f)); graph_->set_color("buffered-video", diagnostics::color(0.2f, 0.9f, 0.9f)); - if (config_.hdr) { - video_conversion_ = create_video_converter(); - } - if (config.duplex != configuration::duplex_t::default_duplex) { set_duplex(iface_cast(decklink_), iface_cast(decklink_), @@ -745,12 +739,12 @@ struct decklink_consumer final : public IDeckLinkVideoOutputCallback nb_samples); } - std::shared_ptr rgb_image_data = - allocate_frame_data(decklink_format_desc_, config_.hdr ? bmdFormat10BitRGBXLE : bmdFormat8BitBGRA); + std::shared_ptr image_data = + allocate_frame_data(decklink_format_desc_, config_.hdr ? bmdFormat10BitYUV : bmdFormat8BitBGRA); - schedule_next_video(rgb_image_data, nb_samples, video_scheduled_, config_.hdr_meta.default_color_space); + schedule_next_video(image_data, nb_samples, video_scheduled_, config_.hdr_meta.default_color_space); for (auto& context : secondary_port_contexts_) { - context->schedule_next_video(rgb_image_data, 0, video_scheduled_); + context->schedule_next_video(image_data, 0, video_scheduled_); } video_scheduled_ += decklink_format_desc_.duration; @@ -1036,34 +1030,17 @@ struct decklink_consumer final : public IDeckLinkVideoOutputCallback BMDTimeValue display_time, core::color_space color_space) { - auto rgb_frame = wrap_raw( + auto frame = wrap_raw( new decklink_frame(decklink_format_desc_, nb_samples, config_.hdr, color_space, config_.hdr_meta, - config_.hdr ? bmdFormat10BitRGBXLE : bmdFormat8BitBGRA, + config_.hdr ? bmdFormat10BitYUV : bmdFormat8BitBGRA, std::move(image_data))); - if (config_.hdr) { - auto yuv_frame = wrap_raw( - new decklink_frame(decklink_format_desc_, nb_samples, config_.hdr, color_space, config_.hdr_meta)); - - if (FAILED(video_conversion_->ConvertFrame(get_raw(rgb_frame), get_raw(yuv_frame)))) { - CASPAR_LOG(warning) << print() << L" Failed to convert video frame."; - } - - if (FAILED(output_->ScheduleVideoFrame(get_raw(yuv_frame), - display_time, - decklink_format_desc_.duration, - decklink_format_desc_.time_scale))) { - CASPAR_LOG(error) << print() << L" Failed to schedule primary video."; - } - return; - } - if (FAILED(output_->ScheduleVideoFrame( - get_raw(rgb_frame), display_time, decklink_format_desc_.duration, decklink_format_desc_.time_scale))) { + get_raw(frame), display_time, decklink_format_desc_.duration, decklink_format_desc_.time_scale))) { CASPAR_LOG(error) << print() << L" Failed to schedule primary video."; } } diff --git a/src/modules/decklink/consumer/frame.cpp b/src/modules/decklink/consumer/frame.cpp index 30f69d2fbe..3111c03dd0 100644 --- a/src/modules/decklink/consumer/frame.cpp +++ b/src/modules/decklink/consumer/frame.cpp @@ -30,6 +30,34 @@ namespace caspar { namespace decklink { +std::vector bt709{0.2126, 0.7152, 0.0722, -0.1146, -0.3854, 0.5, 0.5, -0.4542, -0.0458}; +std::vector create_int_matrix(const std::vector& matrix) +{ + static const float LumaRangeWidth = 876.f * (1024.f / 1023.f); // 876; + static const float ChromaRangeWidth = 896.f * (1024.f / 1023.f); // 896; + + std::vector color_matrix_f(matrix); + + color_matrix_f[0] *= LumaRangeWidth; + color_matrix_f[1] *= LumaRangeWidth; + color_matrix_f[2] *= LumaRangeWidth; + + color_matrix_f[3] *= ChromaRangeWidth; + color_matrix_f[4] *= ChromaRangeWidth; + color_matrix_f[5] *= ChromaRangeWidth; + color_matrix_f[6] *= ChromaRangeWidth; + color_matrix_f[7] *= ChromaRangeWidth; + color_matrix_f[8] *= ChromaRangeWidth; + + std::vector int_matrix(color_matrix_f.size()); + + transform(color_matrix_f.cbegin(), color_matrix_f.cend(), int_matrix.begin(), [](const float& f) { + return (int32_t)round(f * 1024.f); + }); + + return int_matrix; +}; + BMDPixelFormat get_pixel_format(bool hdr) { return hdr ? bmdFormat10BitYUV : bmdFormat8BitBGRA; } int get_row_bytes(BMDPixelFormat pix_fmt, int width) @@ -81,61 +109,140 @@ void convert_frame(const core::video_format_desc& channel_format_desc, // Fast path if (hdr) { - // Pack eight byte R16G16B16A16 pixels as four byte 10bit RGB R10G10B10XX + auto color_matrix = create_int_matrix(bt709); + + // Pack R16G16B16A16 as v210 const int NUM_THREADS = 8; auto rows_per_thread = decklink_format_desc.height / NUM_THREADS; - size_t byte_count_line = get_row_bytes(bmdFormat10BitRGBXLE, decklink_format_desc.width); + size_t byte_count_line = get_row_bytes(bmdFormat10BitYUV, decklink_format_desc.width); + int fullspeed_x = decklink_format_desc.width / 48; tbb::parallel_for(0, NUM_THREADS, [&](int i) { - auto end = (i + 1) * rows_per_thread; - __m256i zero = _mm256_setzero_si256(); - __m256i fac = _mm256_set1_epi32(876); - __m256i offset = _mm256_set1_epi32(64); + auto end = (i + 1) * rows_per_thread; + // __m128i luma_mult = _mm_set_epi16(4, 1, 16, 4, 1, 16, 0, 0); + // __m128i luma_shuf = _mm_set_epi8(-1, 0, 1, -1, 2, 3, 4, 5, -1, 6, 7, -1, 8, 9, 10, 11); + __m128i luma_mult = _mm_set_epi16(0, 0, 16, 1, 4, 16, 1, 4); + __m128i luma_shuf = _mm_set_epi8(11, 10, 9, 8, -1, 7, 6, -1, 5, 4, 3, 2, -1, 1, 0, -1); + + // __m128i chroma_mult = _mm_set_epi16(1, 16, 4, 1, 16, 4, 0, 0); + // __m128i chroma_shuf = _mm_set_epi8(0, 1, 2, 3, -1, 4, 5, -1, 6, 7, 8, 9, -1, 10, 11, -1); + + __m256i zero = _mm256_setzero_si256(); + __m256i y_offset = _mm256_set1_epi32(64 << 20); + __m256i c_offset = _mm256_set1_epi32(512 << 20); + __m128i yc_ctmp = _mm_set_epi32(0, color_matrix[2], color_matrix[1], color_matrix[0]); + __m128i cb_ctmp = _mm_set_epi32(0, color_matrix[5], color_matrix[4], color_matrix[3]); + __m128i cr_ctmp = _mm_set_epi32(0, color_matrix[8], color_matrix[7], color_matrix[6]); + + __m256i y_coeff = _mm256_set_m128i(yc_ctmp, yc_ctmp); + __m256i cb_coeff = _mm256_set_m128i(cb_ctmp, cb_ctmp); + __m256i cr_coeff = _mm256_set_m128i(cr_ctmp, cr_ctmp); for (int y = firstLine + i * rows_per_thread; y < end; y += decklink_format_desc.field_count) { auto dest = reinterpret_cast(image_data.get()) + (long long)y * byte_count_line / 4; + __m128i* v210_dest = reinterpret_cast<__m128i*>(dest); - for (int x = 0; x < decklink_format_desc.width; x += 4) { + for (int x = 0; x < fullspeed_x; x++) { auto src = reinterpret_cast( - frame.image_data(0).data() + (long long)y * decklink_format_desc.width * 8 + x * 8); - - // SIMD optimized - // Load four pixels at once (16x4 = 64, 64 x 4 = 256 bytes) - __m256i pixels = _mm256_load_si256(reinterpret_cast(src)); - - __m256i pixel13 = _mm256_unpacklo_epi16(pixels, zero); - __m256i pixel24 = _mm256_unpackhi_epi16(pixels, zero); - - pixel13 = _mm256_srli_epi32(pixel13, 6); // shift down to 10 bit precision - pixel24 = _mm256_srli_epi32(pixel24, 6); // shift down to 10 bit precision - - pixel13 = _mm256_mullo_epi32(pixel13, fac); // multiply by 876 - pixel24 = _mm256_mullo_epi32(pixel24, fac); // multiply by 876 - - pixel13 = _mm256_srli_epi32(pixel13, 10); // divide by 1024 - pixel24 = _mm256_srli_epi32(pixel24, 10); // divide by 1024 - - pixel13 = _mm256_add_epi32(pixel13, offset); // add 64 - pixel24 = _mm256_add_epi32(pixel24, offset); // add 64 - - // extract the R, G and B components - __m256i blue_green = _mm256_unpacklo_epi32(pixel13, pixel24); - __m256i red_alpha = _mm256_unpackhi_epi32(pixel13, pixel24); - __m128i bg_low = _mm256_extracti128_si256(blue_green, 0); - __m128i bg_high = _mm256_extracti128_si256(blue_green, 1); - __m128i blue = _mm_unpacklo_epi64(bg_low, bg_high); - __m128i green = _mm_unpackhi_epi64(bg_low, bg_high); - __m128i red = _mm_unpacklo_epi64(_mm256_extracti128_si256(red_alpha, 0), - _mm256_extracti128_si256(red_alpha, 1)); - - // shift each component to their correct position in R10G10B10XX - red = _mm_slli_epi32(red, 22); - green = _mm_slli_epi32(green, 12); - blue = _mm_slli_epi32(blue, 2); - - // combine the components - __m128i result = _mm_add_epi32(_mm_add_epi32(red, green), blue); - - // store all four pixels at once - _mm_store_si128(reinterpret_cast<__m128i*>(&dest[x]), result); + frame.image_data(0).data() + ((long long)y * decklink_format_desc.width + x * 48) * 8); + + + // Load pixels + const __m256i* pixeldata = reinterpret_cast(src); + + __m256i luma[6]; + __m256i chroma[6]; + + for (int i = 0; i < 6; i++) { + __m256i p0123 = _mm256_load_si256(pixeldata + i * 2); + __m256i p4567 = _mm256_load_si256(pixeldata + i * 2 + 1); + + // shift down to 10 bit precision + p0123 = _mm256_srli_epi16(p0123, 6); + p4567 = _mm256_srli_epi16(p4567, 6); + + // unpack 16 bit values to 32 bit registers, padding with zeros + __m256i pixel_pairs[4]; + pixel_pairs[0] = _mm256_unpacklo_epi16(p0123, zero); // pixels 0 2 + pixel_pairs[1] = _mm256_unpackhi_epi16(p0123, zero); // pixels 1 3 + pixel_pairs[2] = _mm256_unpacklo_epi16(p4567, zero); // pixels 4 6 + pixel_pairs[3] = _mm256_unpackhi_epi16(p4567, zero); // pixels 5 7 + + /* COMPUTE LUMA */ + { + // Multiply by y-coefficients + __m256i y4[4]; + for (int i = 0; i < 4; i++) { + y4[i] = _mm256_mullo_epi32(pixel_pairs[i], y_coeff); + } + + // sum products + __m256i y2_sum0123 = _mm256_hadd_epi32(y4[0], y4[1]); + __m256i y2_sum4567 = _mm256_hadd_epi32(y4[2], y4[3]); + __m256i y_sum01452367 = _mm256_hadd_epi32(y2_sum0123, y2_sum4567); + luma[i] = _mm256_srli_epi32(_mm256_add_epi32(y_sum01452367, y_offset), + 20); // add offset and shift down to 10 bit precision + } + + /* COMPUTE CHROMA */ + { + // Multiply by cb-coefficients + __m256i cbcr4[4]; // 0 = cb02, 1 = cr02, 2 = cb46, 3 = cr46 + for (int i = 0; i < 2; i++) { + cbcr4[i * 2] = _mm256_mullo_epi32(pixel_pairs[i * 2], cb_coeff); + cbcr4[i * 2 + 1] = _mm256_mullo_epi32(pixel_pairs[i * 2], cr_coeff); + } + + // sum products + __m256i cb_sum0426 = _mm256_hadd_epi32(cbcr4[0], cbcr4[2]); + __m256i cr_sum0426 = _mm256_hadd_epi32(cbcr4[1], cbcr4[3]); + __m256i cbcr_sum_0426 = _mm256_hadd_epi32(cb_sum0426, cr_sum0426); + chroma[i] = _mm256_srli_epi32(_mm256_add_epi32(cbcr_sum_0426, c_offset), + 20); // add offset and shift down to 10 bit precision + } + } + + /*-- pack v210 --*/ + + // luma layout = y0 y1 y4 y5 y2 y3 y6 y7 + // chroma layout = cb0 cr0 cb4 cr4 cb2 cr2 cb6 cr6 + + __m256i luma_16bit[3]; + __m256i chroma_16bit[3]; + __m256i offsets = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);// (0, 4, 1, 5, 2, 6, 3, 7); + for (int i = 0; i < 3; i++) { + auto y16 = + _mm256_packus_epi32(luma[i * 2], luma[i * 2 + 1]); // layout 0 1 4 5 8 9 12 13 2 + // 3 6 7 10 11 14 15 + auto cbcr16 = _mm256_packus_epi32(chroma[i * 2], + chroma[i * 2 + 1]); // cbcr0 cbcr4 cbcr8 cbcr12 + // cbcr2 cbcr6 cbcr10 cbcr14 + luma_16bit[i] = _mm256_permutevar8x32_epi32( + y16, + offsets); // layout 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + chroma_16bit[i] = _mm256_permutevar8x32_epi32( + cbcr16, + offsets); // cbcr0 cbcr2 cbcr4 cbcr6 cbcr8 cbcr10 cbcr12 cbcr14 + } + + __m128i chroma_mult = _mm_set_epi16(0, 0, 4, 16, 1, 4, 16, 1); + __m128i chroma_shuf = _mm_set_epi8(-1, 11, 10, -1, 9, 8, 7, 6, -1, 5, 4, -1, 3, 2, 1, 0); + + uint16_t* luma_ptr = reinterpret_cast(luma_16bit); + uint16_t* chroma_ptr = reinterpret_cast(chroma_16bit); + for (int i = 0; i < 8; ++i) { + __m128i luma = _mm_loadu_si128(reinterpret_cast<__m128i*>(luma_ptr)); + __m128i chroma = _mm_loadu_si128(reinterpret_cast<__m128i*>(chroma_ptr)); + __m128i luma_packed = _mm_mullo_epi16(luma, luma_mult); + __m128i chroma_packed = _mm_mullo_epi16(chroma, chroma_mult); + + luma_packed = _mm_shuffle_epi8(luma_packed, luma_shuf); + chroma_packed = _mm_shuffle_epi8(chroma_packed, chroma_shuf); + + auto res = _mm_or_si128(luma_packed, chroma_packed); + _mm_store_si128(v210_dest++, res); + + luma_ptr += 6; + chroma_ptr += 6; + } } } }); From 712024e613c052c5a99f5b0bf407b9b72d10b2cf Mon Sep 17 00:00:00 2001 From: Niklas Andersson <3985238+niklaspandersson@users.noreply.github.com> Date: Mon, 9 Sep 2024 10:02:46 +0200 Subject: [PATCH 6/6] [fixup v210] colors --- src/modules/decklink/consumer/frame.cpp | 63 ++++++++++++------------- 1 file changed, 30 insertions(+), 33 deletions(-) diff --git a/src/modules/decklink/consumer/frame.cpp b/src/modules/decklink/consumer/frame.cpp index 3111c03dd0..5134689d2a 100644 --- a/src/modules/decklink/consumer/frame.cpp +++ b/src/modules/decklink/consumer/frame.cpp @@ -116,44 +116,36 @@ void convert_frame(const core::video_format_desc& channel_format_desc, auto rows_per_thread = decklink_format_desc.height / NUM_THREADS; size_t byte_count_line = get_row_bytes(bmdFormat10BitYUV, decklink_format_desc.width); int fullspeed_x = decklink_format_desc.width / 48; - tbb::parallel_for(0, NUM_THREADS, [&](int i) { - auto end = (i + 1) * rows_per_thread; - // __m128i luma_mult = _mm_set_epi16(4, 1, 16, 4, 1, 16, 0, 0); - // __m128i luma_shuf = _mm_set_epi8(-1, 0, 1, -1, 2, 3, 4, 5, -1, 6, 7, -1, 8, 9, 10, 11); - __m128i luma_mult = _mm_set_epi16(0, 0, 16, 1, 4, 16, 1, 4); - __m128i luma_shuf = _mm_set_epi8(11, 10, 9, 8, -1, 7, 6, -1, 5, 4, 3, 2, -1, 1, 0, -1); - - // __m128i chroma_mult = _mm_set_epi16(1, 16, 4, 1, 16, 4, 0, 0); - // __m128i chroma_shuf = _mm_set_epi8(0, 1, 2, 3, -1, 4, 5, -1, 6, 7, 8, 9, -1, 10, 11, -1); - + tbb::parallel_for(0, NUM_THREADS, [&](int thread_index) { + auto end = (thread_index + 1) * rows_per_thread; __m256i zero = _mm256_setzero_si256(); __m256i y_offset = _mm256_set1_epi32(64 << 20); - __m256i c_offset = _mm256_set1_epi32(512 << 20); - __m128i yc_ctmp = _mm_set_epi32(0, color_matrix[2], color_matrix[1], color_matrix[0]); - __m128i cb_ctmp = _mm_set_epi32(0, color_matrix[5], color_matrix[4], color_matrix[3]); - __m128i cr_ctmp = _mm_set_epi32(0, color_matrix[8], color_matrix[7], color_matrix[6]); + __m256i c_offset = _mm256_set1_epi32((1025) << 19); + __m128i yc_ctmp = _mm_set_epi32(0, color_matrix[2], color_matrix[1], color_matrix[0]); + __m128i cb_ctmp = _mm_set_epi32(0, color_matrix[5], color_matrix[4], color_matrix[3]); + __m128i cr_ctmp = _mm_set_epi32(0, color_matrix[8], color_matrix[7], color_matrix[6]); __m256i y_coeff = _mm256_set_m128i(yc_ctmp, yc_ctmp); __m256i cb_coeff = _mm256_set_m128i(cb_ctmp, cb_ctmp); __m256i cr_coeff = _mm256_set_m128i(cr_ctmp, cr_ctmp); - for (int y = firstLine + i * rows_per_thread; y < end; y += decklink_format_desc.field_count) { - auto dest = reinterpret_cast(image_data.get()) + (long long)y * byte_count_line / 4; + for (int y = firstLine + thread_index * rows_per_thread; y < end; + y += decklink_format_desc.field_count) { + auto dest = reinterpret_cast(image_data.get()) + (long long)y * byte_count_line / 4; __m128i* v210_dest = reinterpret_cast<__m128i*>(dest); for (int x = 0; x < fullspeed_x; x++) { auto src = reinterpret_cast( frame.image_data(0).data() + ((long long)y * decklink_format_desc.width + x * 48) * 8); - // Load pixels const __m256i* pixeldata = reinterpret_cast(src); __m256i luma[6]; __m256i chroma[6]; - for (int i = 0; i < 6; i++) { - __m256i p0123 = _mm256_load_si256(pixeldata + i * 2); - __m256i p4567 = _mm256_load_si256(pixeldata + i * 2 + 1); + for (int batch_index = 0; batch_index < 6; batch_index++) { + __m256i p0123 = _mm256_load_si256(pixeldata + batch_index * 2); + __m256i p4567 = _mm256_load_si256(pixeldata + batch_index * 2 + 1); // shift down to 10 bit precision p0123 = _mm256_srli_epi16(p0123, 6); @@ -178,8 +170,9 @@ void convert_frame(const core::video_format_desc& channel_format_desc, __m256i y2_sum0123 = _mm256_hadd_epi32(y4[0], y4[1]); __m256i y2_sum4567 = _mm256_hadd_epi32(y4[2], y4[3]); __m256i y_sum01452367 = _mm256_hadd_epi32(y2_sum0123, y2_sum4567); - luma[i] = _mm256_srli_epi32(_mm256_add_epi32(y_sum01452367, y_offset), - 20); // add offset and shift down to 10 bit precision + luma[batch_index] = + _mm256_srli_epi32(_mm256_add_epi32(y_sum01452367, y_offset), + 20); // add offset and shift down to 10 bit precision } /* COMPUTE CHROMA */ @@ -192,11 +185,12 @@ void convert_frame(const core::video_format_desc& channel_format_desc, } // sum products - __m256i cb_sum0426 = _mm256_hadd_epi32(cbcr4[0], cbcr4[2]); - __m256i cr_sum0426 = _mm256_hadd_epi32(cbcr4[1], cbcr4[3]); - __m256i cbcr_sum_0426 = _mm256_hadd_epi32(cb_sum0426, cr_sum0426); - chroma[i] = _mm256_srli_epi32(_mm256_add_epi32(cbcr_sum_0426, c_offset), - 20); // add offset and shift down to 10 bit precision + __m256i cbcr_sum02 = _mm256_hadd_epi32(cbcr4[1], cbcr4[0]); + __m256i cbcr_sum46 = _mm256_hadd_epi32(cbcr4[3], cbcr4[2]); + __m256i cbcr_sum_0426 = _mm256_hadd_epi32(cbcr_sum02, cbcr_sum46); + chroma[batch_index] = + _mm256_srli_epi32(_mm256_add_epi32(cbcr_sum_0426, c_offset), + 20); // add offset and shift down to 10 bit precision } } @@ -207,14 +201,14 @@ void convert_frame(const core::video_format_desc& channel_format_desc, __m256i luma_16bit[3]; __m256i chroma_16bit[3]; - __m256i offsets = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);// (0, 4, 1, 5, 2, 6, 3, 7); + __m256i offsets = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); // (0, 4, 1, 5, 2, 6, 3, 7); for (int i = 0; i < 3; i++) { auto y16 = _mm256_packus_epi32(luma[i * 2], luma[i * 2 + 1]); // layout 0 1 4 5 8 9 12 13 2 // 3 6 7 10 11 14 15 auto cbcr16 = _mm256_packus_epi32(chroma[i * 2], - chroma[i * 2 + 1]); // cbcr0 cbcr4 cbcr8 cbcr12 - // cbcr2 cbcr6 cbcr10 cbcr14 + chroma[i * 2 + 1]); // cbcr0 cbcr4 cbcr8 cbcr12 + // cbcr2 cbcr6 cbcr10 cbcr14 luma_16bit[i] = _mm256_permutevar8x32_epi32( y16, offsets); // layout 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 @@ -226,11 +220,14 @@ void convert_frame(const core::video_format_desc& channel_format_desc, __m128i chroma_mult = _mm_set_epi16(0, 0, 4, 16, 1, 4, 16, 1); __m128i chroma_shuf = _mm_set_epi8(-1, 11, 10, -1, 9, 8, 7, 6, -1, 5, 4, -1, 3, 2, 1, 0); + __m128i luma_mult = _mm_set_epi16(0, 0, 16, 1, 4, 16, 1, 4); + __m128i luma_shuf = _mm_set_epi8(11, 10, 9, 8, -1, 7, 6, -1, 5, 4, 3, 2, -1, 1, 0, -1); + uint16_t* luma_ptr = reinterpret_cast(luma_16bit); uint16_t* chroma_ptr = reinterpret_cast(chroma_16bit); for (int i = 0; i < 8; ++i) { - __m128i luma = _mm_loadu_si128(reinterpret_cast<__m128i*>(luma_ptr)); - __m128i chroma = _mm_loadu_si128(reinterpret_cast<__m128i*>(chroma_ptr)); + __m128i luma = _mm_loadu_si128(reinterpret_cast<__m128i*>(luma_ptr)); + __m128i chroma = _mm_loadu_si128(reinterpret_cast<__m128i*>(chroma_ptr)); __m128i luma_packed = _mm_mullo_epi16(luma, luma_mult); __m128i chroma_packed = _mm_mullo_epi16(chroma, chroma_mult); @@ -333,7 +330,7 @@ std::shared_ptr convert_frame_for_port(const core::video_format_desc& chan bool hdr) { std::shared_ptr image_data = - allocate_frame_data(decklink_format_desc, hdr ? bmdFormat10BitRGBXLE : bmdFormat8BitBGRA); + allocate_frame_data(decklink_format_desc, hdr ? bmdFormat10BitYUV : bmdFormat8BitBGRA); if (field_dominance != bmdProgressiveFrame) { convert_frame(channel_format_desc,