diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..412eeda
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,22 @@
+# Auto detect text files and perform LF normalization
+* text=auto
+
+# Custom for Visual Studio
+*.cs     diff=csharp
+*.sln    merge=union
+*.csproj merge=union
+*.vbproj merge=union
+*.fsproj merge=union
+*.dbproj merge=union
+
+# Standard to msysgit
+*.doc	 diff=astextplain
+*.DOC	 diff=astextplain
+*.docx diff=astextplain
+*.DOCX diff=astextplain
+*.dot  diff=astextplain
+*.DOT  diff=astextplain
+*.pdf  diff=astextplain
+*.PDF	 diff=astextplain
+*.rtf	 diff=astextplain
+*.RTF	 diff=astextplain
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..7cdee98
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,163 @@
+# Windows image file caches
+Thumbs.db
+ehthumbs.db
+
+# Folder config file
+Desktop.ini
+
+# Recycle Bin used on file shares
+$RECYCLE.BIN/
+
+# Windows Installer files
+*.cab
+*.msi
+*.msm
+*.msp
+
+# =========================
+# Operating System Files
+# =========================
+
+# OSX
+# =========================
+
+.DS_Store
+.AppleDouble
+.LSOverride
+
+# Icon must end with two \r
+Icon
+
+# Thumbnails
+._*
+
+# Files that might appear on external disk
+.Spotlight-V100
+.Trashes
+
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+
+# Object files
+*.o
+*.ko
+*.obj
+*.elf
+
+# Precompiled Headers
+*.gch
+*.pch
+
+# Libraries
+*.lib
+*.a
+*.la
+*.lo
+
+# Shared objects (inc. Windows DLLs)
+*.dll
+*.so
+*.so.*
+*.dylib
+
+# Executables
+*.exe
+*.out
+*.app
+*.i*86
+*.x86_64
+*.hex
+
+## Ignore Visual Studio temporary files, build results, and
+## files generated by popular Visual Studio add-ons.
+
+# User-specific files
+*.suo
+*.user
+*.userosscache
+*.sln.docstates
+
+# User-specific files (MonoDevelop/Xamarin Studio)
+*.userprefs
+
+# Build results
+[Dd]ebug/
+[Dd]ebugPublic/
+[Rr]elease/
+!unix_makefiles/Release/
+[Rr]eleases/
+x64/
+x86/
+build/
+bld/
+[Bb]in/
+[Oo]bj/
+
+# Visual Studio 2015 cache/options directory
+.vs/
+
+# MSTest test Results
+[Tt]est[Rr]esult*/
+[Bb]uild[Ll]og.*
+
+#NUNIT
+*.VisualState.xml
+TestResult.xml
+
+# Build Results of an ATL Project
+[Dd]ebugPS/
+[Rr]eleasePS/
+dlldata.c
+
+*_i.c
+*_p.c
+*_i.h
+*.ilk
+*.meta
+*.obj
+*.pch
+*.pdb
+*.pgc
+*.pgd
+*.rsp
+*.sbr
+*.tlb
+*.tli
+*.tlh
+*.tmp
+*.tmp_proj
+*.log
+*.vspscc
+*.vssscc
+.builds
+*.pidb
+*.svclog
+*.scc
+
+# Chutzpah Test files
+_Chutzpah*
+
+# Visual C++ cache files
+ipch/
+*.aps
+*.ncb
+*.opensdf
+*.sdf
+*.cachefile
+
+*.VC.db
+
+# Visual Studio profiler
+*.psess
+*.vsp
+*.vspx
+
+#OpenWatcom files
+*.lk1
+*.map
+*.mk
+*.mk1
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..5705a3a
--- /dev/null
+++ b/README.md
@@ -0,0 +1,53 @@
+Репозиторий Fractal Image Compression
+=====================================
+
+Описание
+--------
+
+Простой кодек для фрактального сжатия изображений. Основан на описании из первой главы книги Yuval Fisher «Fractal Image Compression: Theory and Application».
+
+Кодирование реализовано на CPU (compress.exe) и GPU (cuda_compress.exe — CUDA 7.5, compute capability >= 2.0, и cudaold_compress.exe — CUDA 6.5, compute capability 1.1 и 2.0). Имеется версия с поддержкой MPI.
+
+Ссылки
+------
+
+[Репозиторий](https://github.com/ImageProcessing-ElectronicPublications/fractal_image_compression)
+
+[Скачать](https://github.com/ImageProcessing-ElectronicPublications/fractal_image_compression/releases)
+
+Для запуска под Windows необходимы Windows XP или выше и Visual Studio 2013 runtime; для использования с MPI необходим MPICH2 v1.4.1p1.
+
+Использование
+-------------
+
+Как сжимать:
+
+>compress.exe input.tga output.fi 32 4 5
+
+где 32 - размер рангового блока (ширина и высота изображения должны делиться на 32; чем больше размер рангового блока, тем лучше сжатие), 4 - количество потоков ЦПУ, 5 - максимальное значение погрешности для пикселя (чем больше число, тем меньше файл, но хуже качество получившегося изображения).
+
+Как разжимать:
+
+>decompress.exe output.fi decompressed.tga
+
+или
+
+>decompress.exe output.fi decompressed.tga 3
+
+если вы хотите увеличить ширину и высоту изображения в 3 раза (можно использовать любое целое число).
+
+Сборка
+------
+
+Под Windows просто используйте Visual Studio 2013 и выше.
+
+Под unix просто войдите в unix_makefiles и введите в консоли:
+
+>make all
+
+Для сборки под Unix также необходимы gcc и MPI (тестировалось под Debian 7 с MPICH2 версии 1.4.1p1).
+
+Лицензия
+--------
+
+Лицензии нет. Общественное достояние.
diff --git a/chihiro2.tga b/chihiro2.tga
new file mode 100644
index 0000000..2a6e850
Binary files /dev/null and b/chihiro2.tga differ
diff --git a/compress/compress.vcxproj b/compress/compress.vcxproj
new file mode 100644
index 0000000..5922b13
--- /dev/null
+++ b/compress/compress.vcxproj
@@ -0,0 +1,128 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release (MPI)|Win32">
+      <Configuration>Release (MPI)</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{65DA45D0-C86F-4A4A-8169-2AA90B68AA47}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>compress</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>v120_xp</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>v120_xp</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release (MPI)|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>v120_xp</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release (MPI)|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <LinkIncremental>false</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release (MPI)|Win32'">
+    <LinkIncremental>false</LinkIncremental>
+    <IncludePath>$(ProgramFiles)\MPICH2\include;$(VC_IncludePath);$(WindowsSDK_IncludePath)</IncludePath>
+    <LibraryPath>$(ProgramFiles)\MPICH2\lib;$(VC_LibraryPath_x86);$(WindowsSDK_LibraryPath_x86);</LibraryPath>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release (MPI)|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>I_USE_MPI;WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <AdditionalDependencies>mpi.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="..\src\arrays.c" />
+    <ClCompile Include="..\src\compress.c" />
+    <ClCompile Include="..\src\cpu_driven\fi_save_blocks_search_workers.c" />
+    <ClCompile Include="..\src\fi_save.c" />
+    <ClCompile Include="..\src\fi_save_blocks_compression.c" />
+    <ClCompile Include="..\src\fi_save_blocks_search.c" />
+    <ClCompile Include="..\src\image_misc.c" />
+    <ClCompile Include="..\src\tga_load.c" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
\ No newline at end of file
diff --git a/compress/compress.vcxproj.filters b/compress/compress.vcxproj.filters
new file mode 100644
index 0000000..55e1b33
--- /dev/null
+++ b/compress/compress.vcxproj.filters
@@ -0,0 +1,46 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup>
+    <Filter Include="Source Files">
+      <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
+      <Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
+    </Filter>
+    <Filter Include="Header Files">
+      <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
+      <Extensions>h;hh;hpp;hxx;hm;inl;inc;xsd</Extensions>
+    </Filter>
+    <Filter Include="Resource Files">
+      <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
+      <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
+    </Filter>
+    <Filter Include="Source Files\CPU driven">
+      <UniqueIdentifier>{9856afa4-a7d8-48a4-8188-a60d310d20fc}</UniqueIdentifier>
+    </Filter>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="..\src\compress.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\fi_save.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\tga_load.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\image_misc.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\arrays.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\fi_save_blocks_compression.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\fi_save_blocks_search.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\cpu_driven\fi_save_blocks_search_workers.c">
+      <Filter>Source Files\CPU driven</Filter>
+    </ClCompile>
+  </ItemGroup>
+</Project>
\ No newline at end of file
diff --git a/cuda_compress/cuda_compress.vcxproj b/cuda_compress/cuda_compress.vcxproj
new file mode 100644
index 0000000..da8a8af
--- /dev/null
+++ b/cuda_compress/cuda_compress.vcxproj
@@ -0,0 +1,256 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release (MPI)|Win32">
+      <Configuration>Release (MPI)</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release (MPI)|x64">
+      <Configuration>Release (MPI)</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="..\src\arrays.c" />
+    <ClCompile Include="..\src\compress.c" />
+    <ClCompile Include="..\src\fi_save.c" />
+    <ClCompile Include="..\src\fi_save_blocks_compression.c" />
+    <ClCompile Include="..\src\fi_save_blocks_search.c" />
+    <ClCompile Include="..\src\image_misc.c" />
+    <ClCompile Include="..\src\tga_load.c" />
+  </ItemGroup>
+  <ItemGroup>
+    <CudaCompile Include="..\src\gpu_driven\fi_save_blocks_search_workers.cu" />
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{AA1B258B-9E5E-478B-9D70-1B6EA5F7E8E9}</ProjectGuid>
+    <RootNamespace>cuda_compress</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v120</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v120</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v120</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release (MPI)|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v120</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v120</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release (MPI)|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v120</PlatformToolset>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 7.5.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release (MPI)|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release (MPI)|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release (MPI)|Win32'">
+    <IncludePath>$(ProgramFiles)\MPICH2\include;$(VC_IncludePath);$(WindowsSDK_IncludePath);</IncludePath>
+    <LibraryPath>$(ProgramFiles)\MPICH2\lib;$(VC_LibraryPath_x86);$(WindowsSDK_LibraryPath_x86);</LibraryPath>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>SINGLE_THREAD_BY_DEFAULT;WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <MinimumRequiredVersion>5.01</MinimumRequiredVersion>
+    </Link>
+    <PostBuildEvent>
+      <Command>echo copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"
+copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
+    </PostBuildEvent>
+    <CudaCompile>
+      <CodeGeneration>compute_20,sm_20;compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;</CodeGeneration>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;WIN64;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+    <PostBuildEvent>
+      <Command>echo copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"
+copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
+    </PostBuildEvent>
+    <CudaCompile>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>SINGLE_THREAD_BY_DEFAULT;WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <MinimumRequiredVersion>5.01</MinimumRequiredVersion>
+    </Link>
+    <PostBuildEvent>
+      <Command>echo copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"
+copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
+    </PostBuildEvent>
+    <CudaCompile>
+      <CodeGeneration>compute_20,sm_20;compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;</CodeGeneration>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release (MPI)|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>SINGLE_THREAD_BY_DEFAULT;I_USE_MPI;WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>mpi.lib;cudart.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <MinimumRequiredVersion>5.01</MinimumRequiredVersion>
+    </Link>
+    <PostBuildEvent>
+      <Command>echo copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"
+copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
+    </PostBuildEvent>
+    <CudaCompile>
+      <CodeGeneration>compute_20,sm_20;compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;</CodeGeneration>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;WIN64;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+    <PostBuildEvent>
+      <Command>echo copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"
+copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
+    </PostBuildEvent>
+    <CudaCompile>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release (MPI)|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;WIN64;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+    <PostBuildEvent>
+      <Command>echo copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"
+copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
+    </PostBuildEvent>
+    <CudaCompile>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 7.5.targets" />
+  </ImportGroup>
+</Project>
\ No newline at end of file
diff --git a/cuda_compress/cuda_compress.vcxproj.filters b/cuda_compress/cuda_compress.vcxproj.filters
new file mode 100644
index 0000000..6e3e19a
--- /dev/null
+++ b/cuda_compress/cuda_compress.vcxproj.filters
@@ -0,0 +1,25 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup>
+    <Filter Include="CPU driven">
+      <UniqueIdentifier>{e0e0e71a-6e32-4e79-b5c7-afd5b1a6e698}</UniqueIdentifier>
+    </Filter>
+    <Filter Include="GPU driven">
+      <UniqueIdentifier>{04d7c5b1-6017-4c20-9d85-3b4efa11ce02}</UniqueIdentifier>
+    </Filter>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="..\src\fi_save_blocks_search.c" />
+    <ClCompile Include="..\src\arrays.c" />
+    <ClCompile Include="..\src\compress.c" />
+    <ClCompile Include="..\src\fi_save.c" />
+    <ClCompile Include="..\src\fi_save_blocks_compression.c" />
+    <ClCompile Include="..\src\image_misc.c" />
+    <ClCompile Include="..\src\tga_load.c" />
+  </ItemGroup>
+  <ItemGroup>
+    <CudaCompile Include="..\src\gpu_driven\fi_save_blocks_search_workers.cu">
+      <Filter>GPU driven</Filter>
+    </CudaCompile>
+  </ItemGroup>
+</Project>
\ No newline at end of file
diff --git a/cudaold_compress/cudaold_compress.vcxproj b/cudaold_compress/cudaold_compress.vcxproj
new file mode 100644
index 0000000..6194b84
--- /dev/null
+++ b/cudaold_compress/cudaold_compress.vcxproj
@@ -0,0 +1,256 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <!-- Build matrix: Debug, Release and Release (MPI), each offered for Win32 and x64. -->
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release (MPI)|Win32">
+      <Configuration>Release (MPI)</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release (MPI)|x64">
+      <Configuration>Release (MPI)</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <!-- The single CUDA source, built through the CudaCompile item type. -->
+  <ItemGroup>
+    <CudaCompile Include="..\src\gpu_driven\fi_save_blocks_search_workers.cu" />
+  </ItemGroup>
+  <!-- Host-side C sources built as ClCompile items. -->
+  <ItemGroup>
+    <ClCompile Include="..\src\arrays.c" />
+    <ClCompile Include="..\src\compress.c" />
+    <ClCompile Include="..\src\fi_save.c" />
+    <ClCompile Include="..\src\fi_save_blocks_compression.c" />
+    <ClCompile Include="..\src\fi_save_blocks_search.c" />
+    <ClCompile Include="..\src\image_misc.c" />
+    <ClCompile Include="..\src\tga_load.c" />
+  </ItemGroup>
+  <!-- ProjectGuid is the identifier fractal_image_compression.sln uses to reference this project. -->
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{D38F1866-2265-48A0-AD93-3A5E05C69163}</ProjectGuid>
+    <RootNamespace>cudaold_compress</RootNamespace>
+  </PropertyGroup>
+  <!-- Default C++ properties come first; the per-configuration groups below refine them. -->
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <!-- Every configuration builds an Application with the v120 toolset and the MultiByte character set. -->
+  <!-- Debug configurations use the debug libraries; Release configurations enable WholeProgramOptimization. -->
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v120</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v120</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v120</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release (MPI)|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v120</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v120</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release (MPI)|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v120</PlatformToolset>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <!-- CUDA 6.5 build customization; the matching CUDA 6.5.targets file is imported at the end of the project. -->
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 6.5.props" />
+  </ImportGroup>
+  <!-- Optional per-user property sheets: each import is skipped when the file does not exist. -->
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release (MPI)|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release (MPI)|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <!-- Incremental linking is switched on explicitly for the two Debug configurations only. -->
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <!-- MPICH2 headers and libraries are looked up under $(ProgramFiles)\MPICH2. -->
+  <!-- Only Release (MPI)|Win32 gets these paths; no equivalent group exists for Release (MPI)|x64. -->
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release (MPI)|Win32'">
+    <IncludePath>$(ProgramFiles)\MPICH2\include;$(VC_IncludePath);$(WindowsSDK_IncludePath);</IncludePath>
+    <LibraryPath>$(ProgramFiles)\MPICH2\lib;$(VC_LibraryPath_x86);$(WindowsSDK_LibraryPath_x86);</LibraryPath>
+  </PropertyGroup>
+  <!-- Debug|Win32: unoptimized build. -->
+  <!-- SINGLE_THREAD_BY_DEFAULT makes compress.c default to one worker thread instead of four. -->
+  <!-- cudart.lib is linked, and the post-build step copies the cudart DLLs from the CUDA toolkit into $(OutDir). -->
+  <!-- Device code is generated for compute_12/sm_12 and compute_20/sm_20. -->
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>SINGLE_THREAD_BY_DEFAULT;WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <MinimumRequiredVersion>5.01</MinimumRequiredVersion>
+    </Link>
+    <PostBuildEvent>
+      <Command>echo copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"
+copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
+    </PostBuildEvent>
+    <CudaCompile>
+      <CodeGeneration>compute_12,sm_12;compute_20,sm_20;</CodeGeneration>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <!-- Debug|x64: unoptimized 64-bit build. -->
+  <!-- Unlike Debug|Win32 it defines WIN64, omits SINGLE_THREAD_BY_DEFAULT and MinimumRequiredVersion, -->
+  <!-- and sets the CUDA TargetMachinePlatform to 64 instead of listing CodeGeneration targets. -->
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;WIN64;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+    <PostBuildEvent>
+      <Command>echo copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"
+copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
+    </PostBuildEvent>
+    <CudaCompile>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <!-- Release|Win32: MaxSpeed optimization with function-level linking and intrinsics. -->
+  <!-- The linker folds COMDATs and drops unreferenced code; debug information is still generated. -->
+  <!-- SINGLE_THREAD_BY_DEFAULT makes compress.c default to one worker thread instead of four. -->
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>SINGLE_THREAD_BY_DEFAULT;WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <MinimumRequiredVersion>5.01</MinimumRequiredVersion>
+    </Link>
+    <PostBuildEvent>
+      <Command>echo copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"
+copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
+    </PostBuildEvent>
+    <CudaCompile>
+      <CodeGeneration>compute_12,sm_12;compute_20,sm_20;</CodeGeneration>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <!-- Release (MPI)|Win32: the Release|Win32 settings plus MPI support. -->
+  <!-- I_USE_MPI switches on the MPI code in compress.c (mpi.h, MPI_Init, MPI_Finalize) and mpi.lib is linked first. -->
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release (MPI)|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>SINGLE_THREAD_BY_DEFAULT;I_USE_MPI;WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>mpi.lib;cudart.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <MinimumRequiredVersion>5.01</MinimumRequiredVersion>
+    </Link>
+    <PostBuildEvent>
+      <Command>echo copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"
+copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
+    </PostBuildEvent>
+    <CudaCompile>
+      <CodeGeneration>compute_12,sm_12;compute_20,sm_20;</CodeGeneration>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <!-- Release|x64: optimized 64-bit build. -->
+  <!-- Unlike Release|Win32 it defines WIN64, omits SINGLE_THREAD_BY_DEFAULT and MinimumRequiredVersion, -->
+  <!-- and sets the CUDA TargetMachinePlatform to 64 instead of listing CodeGeneration targets. -->
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;WIN64;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+    <PostBuildEvent>
+      <Command>echo copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"
+copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
+    </PostBuildEvent>
+    <CudaCompile>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <!-- Release (MPI)|x64: same settings as Release|x64. -->
+  <!-- Neither I_USE_MPI nor mpi.lib appears here, so this configuration builds without MPI despite its name. -->
+  <!-- NOTE(review): fractal_image_compression.sln maps its Release (MPI)|x64 to this project's Release|x64, -->
+  <!-- so this group looks unused when building from the solution. Confirm before relying on it. -->
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release (MPI)|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;WIN64;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+    <PostBuildEvent>
+      <Command>echo copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"
+copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
+    </PostBuildEvent>
+    <CudaCompile>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <!-- Standard C++ build targets, followed by the CUDA 6.5 targets that pair with the CUDA 6.5.props import in ExtensionSettings. -->
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 6.5.targets" />
+  </ImportGroup>
+</Project>
\ No newline at end of file
diff --git a/cudaold_compress/cudaold_compress.vcxproj.filters b/cudaold_compress/cudaold_compress.vcxproj.filters
new file mode 100644
index 0000000..ec4a007
--- /dev/null
+++ b/cudaold_compress/cudaold_compress.vcxproj.filters
@@ -0,0 +1,22 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <!-- Solution Explorer filter layout for the items of the cudaold_compress project. -->
+  <ItemGroup>
+    <!-- The only filter declared in this file. -->
+    <Filter Include="GPU driven">
+      <UniqueIdentifier>{3e52e746-98ea-452d-89aa-d1f6432b29fa}</UniqueIdentifier>
+    </Filter>
+  </ItemGroup>
+  <!-- The CUDA source is the only item placed in a filter. -->
+  <ItemGroup>
+    <CudaCompile Include="..\src\gpu_driven\fi_save_blocks_search_workers.cu">
+      <Filter>GPU driven</Filter>
+    </CudaCompile>
+  </ItemGroup>
+  <!-- Host-side C sources; none of them carries Filter metadata. -->
+  <ItemGroup>
+    <ClCompile Include="..\src\arrays.c" />
+    <ClCompile Include="..\src\compress.c" />
+    <ClCompile Include="..\src\fi_save.c" />
+    <ClCompile Include="..\src\fi_save_blocks_compression.c" />
+    <ClCompile Include="..\src\fi_save_blocks_search.c" />
+    <ClCompile Include="..\src\image_misc.c" />
+    <ClCompile Include="..\src\tga_load.c" />
+  </ItemGroup>
+</Project>
\ No newline at end of file
diff --git a/decompress/decompress.vcxproj b/decompress/decompress.vcxproj
new file mode 100644
index 0000000..82a628c
--- /dev/null
+++ b/decompress/decompress.vcxproj
@@ -0,0 +1,88 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <!-- decompress: console application offered for Win32 only, in Debug and Release. -->
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <!-- ProjectGuid is the identifier fractal_image_compression.sln uses to reference this project. -->
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{F97F886B-3E32-4790-B091-A4CBF1B91946}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>decompress</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <!-- Both configurations build an Application with the v120_xp toolset and the Unicode character set. -->
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>v120_xp</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>v120_xp</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <!-- No build customizations are imported here (this group and ExtensionTargets are empty). -->
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <!-- Optional per-user property sheets: each import is skipped when the file does not exist. -->
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <!-- Incremental linking: on for Debug, off for Release. -->
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <LinkIncremental>false</LinkIncremental>
+  </PropertyGroup>
+  <!-- Debug|Win32: unoptimized console build with debug information. -->
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <!-- Release|Win32: MaxSpeed build; the linker folds COMDATs and drops unreferenced code, debug information is kept. -->
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <!-- Sources of the decompressor; arrays.c and image_misc.c are also listed by the compressor projects. -->
+  <ItemGroup>
+    <ClCompile Include="..\src\arrays.c" />
+    <ClCompile Include="..\src\decompress.c" />
+    <ClCompile Include="..\src\fi_load.c" />
+    <ClCompile Include="..\src\image_misc.c" />
+    <ClCompile Include="..\src\tga_save.c" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
\ No newline at end of file
diff --git a/decompress/decompress.vcxproj.filters b/decompress/decompress.vcxproj.filters
new file mode 100644
index 0000000..a72b55f
--- /dev/null
+++ b/decompress/decompress.vcxproj.filters
@@ -0,0 +1,34 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <!-- Solution Explorer filter layout for the items of the decompress project. -->
+  <ItemGroup>
+    <!-- Three filters are declared, each with its list of file extensions. -->
+    <!-- Only "Source Files" has items assigned to it in this file. -->
+    <Filter Include="Source Files">
+      <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
+      <Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
+    </Filter>
+    <Filter Include="Header Files">
+      <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
+      <Extensions>h;hh;hpp;hxx;hm;inl;inc;xsd</Extensions>
+    </Filter>
+    <Filter Include="Resource Files">
+      <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
+      <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
+    </Filter>
+  </ItemGroup>
+  <!-- All five C sources of the project are shown under "Source Files". -->
+  <ItemGroup>
+    <ClCompile Include="..\src\decompress.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\fi_load.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\tga_save.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\image_misc.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\arrays.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+  </ItemGroup>
+</Project>
\ No newline at end of file
diff --git a/fractal_image_compression.sln b/fractal_image_compression.sln
new file mode 100644
index 0000000..116543e
--- /dev/null
+++ b/fractal_image_compression.sln
@@ -0,0 +1,70 @@
+﻿
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2013
+VisualStudioVersion = 12.0.40629.0
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "compress", "compress\compress.vcxproj", "{65DA45D0-C86F-4A4A-8169-2AA90B68AA47}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "decompress", "decompress\decompress.vcxproj", "{F97F886B-3E32-4790-B091-A4CBF1B91946}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cuda_compress", "cuda_compress\cuda_compress.vcxproj", "{AA1B258B-9E5E-478B-9D70-1B6EA5F7E8E9}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cudaold_compress", "cudaold_compress\cudaold_compress.vcxproj", "{D38F1866-2265-48A0-AD93-3A5E05C69163}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Win32 = Debug|Win32
+		Debug|x64 = Debug|x64
+		Release (MPI)|Win32 = Release (MPI)|Win32
+		Release (MPI)|x64 = Release (MPI)|x64
+		Release|Win32 = Release|Win32
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{65DA45D0-C86F-4A4A-8169-2AA90B68AA47}.Debug|Win32.ActiveCfg = Debug|Win32
+		{65DA45D0-C86F-4A4A-8169-2AA90B68AA47}.Debug|Win32.Build.0 = Debug|Win32
+		{65DA45D0-C86F-4A4A-8169-2AA90B68AA47}.Debug|x64.ActiveCfg = Debug|Win32
+		{65DA45D0-C86F-4A4A-8169-2AA90B68AA47}.Release (MPI)|Win32.ActiveCfg = Release (MPI)|Win32
+		{65DA45D0-C86F-4A4A-8169-2AA90B68AA47}.Release (MPI)|Win32.Build.0 = Release (MPI)|Win32
+		{65DA45D0-C86F-4A4A-8169-2AA90B68AA47}.Release (MPI)|x64.ActiveCfg = Release (MPI)|Win32
+		{65DA45D0-C86F-4A4A-8169-2AA90B68AA47}.Release|Win32.ActiveCfg = Release|Win32
+		{65DA45D0-C86F-4A4A-8169-2AA90B68AA47}.Release|Win32.Build.0 = Release|Win32
+		{65DA45D0-C86F-4A4A-8169-2AA90B68AA47}.Release|x64.ActiveCfg = Release|Win32
+		{F97F886B-3E32-4790-B091-A4CBF1B91946}.Debug|Win32.ActiveCfg = Debug|Win32
+		{F97F886B-3E32-4790-B091-A4CBF1B91946}.Debug|Win32.Build.0 = Debug|Win32
+		{F97F886B-3E32-4790-B091-A4CBF1B91946}.Debug|x64.ActiveCfg = Debug|Win32
+		{F97F886B-3E32-4790-B091-A4CBF1B91946}.Release (MPI)|Win32.ActiveCfg = Release|Win32
+		{F97F886B-3E32-4790-B091-A4CBF1B91946}.Release (MPI)|Win32.Build.0 = Release|Win32
+		{F97F886B-3E32-4790-B091-A4CBF1B91946}.Release (MPI)|x64.ActiveCfg = Release|Win32
+		{F97F886B-3E32-4790-B091-A4CBF1B91946}.Release|Win32.ActiveCfg = Release|Win32
+		{F97F886B-3E32-4790-B091-A4CBF1B91946}.Release|Win32.Build.0 = Release|Win32
+		{F97F886B-3E32-4790-B091-A4CBF1B91946}.Release|x64.ActiveCfg = Release|Win32
+		{AA1B258B-9E5E-478B-9D70-1B6EA5F7E8E9}.Debug|Win32.ActiveCfg = Debug|Win32
+		{AA1B258B-9E5E-478B-9D70-1B6EA5F7E8E9}.Debug|Win32.Build.0 = Debug|Win32
+		{AA1B258B-9E5E-478B-9D70-1B6EA5F7E8E9}.Debug|x64.ActiveCfg = Debug|x64
+		{AA1B258B-9E5E-478B-9D70-1B6EA5F7E8E9}.Debug|x64.Build.0 = Debug|x64
+		{AA1B258B-9E5E-478B-9D70-1B6EA5F7E8E9}.Release (MPI)|Win32.ActiveCfg = Release (MPI)|Win32
+		{AA1B258B-9E5E-478B-9D70-1B6EA5F7E8E9}.Release (MPI)|Win32.Build.0 = Release (MPI)|Win32
+		{AA1B258B-9E5E-478B-9D70-1B6EA5F7E8E9}.Release (MPI)|x64.ActiveCfg = Release|x64
+		{AA1B258B-9E5E-478B-9D70-1B6EA5F7E8E9}.Release (MPI)|x64.Build.0 = Release|x64
+		{AA1B258B-9E5E-478B-9D70-1B6EA5F7E8E9}.Release|Win32.ActiveCfg = Release|Win32
+		{AA1B258B-9E5E-478B-9D70-1B6EA5F7E8E9}.Release|Win32.Build.0 = Release|Win32
+		{AA1B258B-9E5E-478B-9D70-1B6EA5F7E8E9}.Release|x64.ActiveCfg = Release|x64
+		{AA1B258B-9E5E-478B-9D70-1B6EA5F7E8E9}.Release|x64.Build.0 = Release|x64
+		{D38F1866-2265-48A0-AD93-3A5E05C69163}.Debug|Win32.ActiveCfg = Debug|Win32
+		{D38F1866-2265-48A0-AD93-3A5E05C69163}.Debug|Win32.Build.0 = Debug|Win32
+		{D38F1866-2265-48A0-AD93-3A5E05C69163}.Debug|x64.ActiveCfg = Debug|x64
+		{D38F1866-2265-48A0-AD93-3A5E05C69163}.Debug|x64.Build.0 = Debug|x64
+		{D38F1866-2265-48A0-AD93-3A5E05C69163}.Release (MPI)|Win32.ActiveCfg = Release (MPI)|Win32
+		{D38F1866-2265-48A0-AD93-3A5E05C69163}.Release (MPI)|Win32.Build.0 = Release (MPI)|Win32
+		{D38F1866-2265-48A0-AD93-3A5E05C69163}.Release (MPI)|x64.ActiveCfg = Release|x64
+		{D38F1866-2265-48A0-AD93-3A5E05C69163}.Release (MPI)|x64.Build.0 = Release|x64
+		{D38F1866-2265-48A0-AD93-3A5E05C69163}.Release|Win32.ActiveCfg = Release|Win32
+		{D38F1866-2265-48A0-AD93-3A5E05C69163}.Release|Win32.Build.0 = Release|Win32
+		{D38F1866-2265-48A0-AD93-3A5E05C69163}.Release|x64.ActiveCfg = Release|x64
+		{D38F1866-2265-48A0-AD93-3A5E05C69163}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
diff --git a/src/arrays.c b/src/arrays.c
new file mode 100644
index 0000000..d7e1f77
--- /dev/null
+++ b/src/arrays.c
@@ -0,0 +1,86 @@
+
+#include "arrays.h"
+
+/*
+ * Reads `size` bits (at most 32) from the bit stream `a`, most significant
+ * bit first, and stores them right-aligned in `*out`.
+ *
+ * Returns false, leaving `a` and `*out` untouched, when `size` exceeds 32 or
+ * when the read would run past `a->bytes_size`. Otherwise the stream cursor
+ * (bp, bits_offset, bytes_offset) is advanced and true is returned.
+ */
+bool arrayReadBits(array_type *a, unsigned int size, unsigned int *out)
+{
+	unsigned int end_bits, bytes_needed, value, bit_pos, byte_pos;
+	unsigned char *cursor;
+
+	if(size > 32)
+		return false;
+
+	/* Number of bytes the read touches, counted from the current byte and rounded up. */
+	end_bits = a->bits_offset+size;
+	bytes_needed = end_bits/8;
+	if((end_bits%8) > 0)
+		bytes_needed++;
+	if((a->bytes_offset+bytes_needed) > a->bytes_size)
+		return false;
+
+	/* Work on local copies; the stream is updated only once the read is complete. */
+	cursor = a->bp;
+	bit_pos = a->bits_offset;
+	byte_pos = a->bytes_offset;
+	value = 0;
+
+	while(size > 0) {
+		unsigned int room = 8-bit_pos;                     /* bits left in the current byte */
+		unsigned int chunk = (size > room) ? room : size;  /* bits taken in this step */
+		unsigned int mask = (1u << chunk)-1u;
+
+		bit_pos += chunk;
+		size -= chunk;
+
+		/* Append the next `chunk` bits of the current byte below the bits gathered so far. */
+		value = (value << chunk) | (((unsigned int)(*cursor) >> (8-bit_pos)) & mask);
+
+		/* Current byte exhausted: step to the next one. */
+		if(bit_pos & 8) {
+			cursor++;
+			byte_pos++;
+			bit_pos = 0;
+		}
+	}
+
+	*out = value;
+
+	a->bp = cursor;
+	a->bits_offset = bit_pos;
+	a->bytes_offset = byte_pos;
+
+	return true;
+}
+
+/*
+ * Writes the low `size` bits (at most 32) of `out` into the bit stream `a`,
+ * most significant bit first.
+ *
+ * Returns false, leaving `a` and the buffer untouched, when `size` exceeds 32
+ * or when the write would run past `a->bytes_size`. Otherwise the stream
+ * cursor (bp, bits_offset, bytes_offset) is advanced and true is returned.
+ *
+ * The addressed bits are replaced rather than added to, so the result no
+ * longer depends on the destination bytes having been zeroed beforehand.
+ * For a zero-filled destination the bytes written are unchanged.
+ */
+bool arrayWriteBits(array_type *a, unsigned int size, unsigned int out)
+{
+	unsigned int bits_offset, bytes_offset;
+	unsigned char *bp;
+
+	if(size > 32) return false;
+
+	/* Bytes touched by the write, counted from the current byte and rounded up. */
+	if((a->bytes_offset+(a->bits_offset+size)/8+((((a->bits_offset+size)%8)>0)?1:0)) > a->bytes_size)
+		return false;
+
+	bp = a->bp;
+	bits_offset = a->bits_offset;
+	bytes_offset = a->bytes_offset;
+
+	while(size > 0) {
+		unsigned int this_step, shift, mask;
+
+		/* Take as many bits as still fit into the current byte. */
+		if(size > (8-bits_offset))
+			this_step = 8-bits_offset;
+		else
+			this_step = size;
+		size -= this_step;
+		bits_offset += this_step;
+
+		/* `size` is now the count of bits still pending, so out>>size exposes this step's bits. */
+		shift = 8-bits_offset;
+		mask = ((1u<<this_step)-1u) << shift;
+
+		/* Clear the target field first so stale bits cannot carry into bits written earlier. */
+		*bp = (unsigned char)((*bp & ~mask) | (((out>>size) << shift) & mask));
+
+		/* Current byte filled: step to the next one. */
+		if(bits_offset & 8) {
+			bits_offset = 0;
+			bytes_offset++;
+			bp++;
+		}
+	}
+
+	a->bp = bp;
+	a->bits_offset = bits_offset;
+	a->bytes_offset = bytes_offset;
+
+	return true;
+}
\ No newline at end of file
diff --git a/src/arrays.h b/src/arrays.h
new file mode 100644
index 0000000..cf4496c
--- /dev/null
+++ b/src/arrays.h
@@ -0,0 +1,18 @@
+
+#ifndef _ARRAYS_H
+#define _ARRAYS_H
+
+#include <stdbool.h>
+
+/* Cursor over a byte buffer that is read or written a few bits at a time. */
+typedef struct {
+	/* Presumably the start of the buffer; arrayReadBits/arrayWriteBits never touch it (verify against callers). */
+	unsigned char *bytes;
+	/* Byte the cursor currently points at. */
+	unsigned char *bp;
+	/* Number of whole bytes already consumed; advanced together with bp. */
+	unsigned int bytes_offset;
+	/* Number of bits already consumed inside *bp; reset to 0 when a byte is finished. */
+	unsigned int bits_offset;
+	/* Size limit in bytes that every read and write is checked against. */
+	unsigned int bytes_size;
+} array_type;
+
+/* Reads `size` (<= 32) bits into *out; returns false if size > 32 or the read would pass bytes_size. */
+extern bool arrayReadBits(array_type *a, unsigned int size, unsigned int *out);
+/* Writes the low `size` (<= 32) bits of `out`; returns false if size > 32 or the write would pass bytes_size. */
+extern bool arrayWriteBits(array_type *a, unsigned int size, unsigned int out);
+
+#endif
diff --git a/src/compress.c b/src/compress.c
new file mode 100644
index 0000000..174c0b2
--- /dev/null
+++ b/src/compress.c
@@ -0,0 +1,154 @@
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <ctype.h>
+
+#ifdef I_USE_MPI
+#include <mpi.h>
+#define RETURN_FROM_MAIN(v) {MPI_Abort(MPI_COMM_WORLD,v);return(v);}
+#else
+#define RETURN_FROM_MAIN(v) {return(v);}
+#endif
+
+#include "tga_load.h"
+#include "image_misc.h"
+#include "fi_save.h"
+
+/*
+ * Entry point of the compressor.
+ *
+ * Usage: <program> input.tga output.fi [block_size=8 [nof_threads=4 [rms_error=256]]]
+ *
+ * Loads the TGA image with tgaLoad(), unpacks its channels, runs
+ * BGRtoYCBCR() on them and writes the result with fiSave().
+ *
+ * Exit codes: 0 on success or when the usage text is printed, 1 when the
+ * input cannot be loaded, 2 when the image dimensions do not fit the block
+ * size, 3 when channel unpacking fails, 4 when saving fails.
+ * When built with I_USE_MPI every early exit goes through MPI_Abort().
+ */
+int main(int argc, char **argv)
+{
+	char *input_filename, *output_filename;
+	image_pc_type image_pc;
+	image_uc_type image_uc;
+	unsigned int i, blocksize, nof_threads, rms_error, worst_diff;
+	char *program_name;
+#ifdef I_USE_MPI
+	int my_rank, nof_procs;
+#endif
+
+#ifdef I_USE_MPI
+	MPI_Init(0, 0);
+
+	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &nof_procs);
+
+	printf("Using MPI. Program rank is %d+1/%d\n", my_rank, nof_procs);
+#endif
+
+	/* Strip the directory part of argv[0]; both '\\' and '/' separators are tried. */
+	program_name = strrchr(argv[0], '\\');
+
+	if(program_name == 0)
+		program_name = strrchr(argv[0], '/');
+
+	if(program_name == 0)
+		program_name = argv[0];
+	else
+		program_name++;
+
+	// Read the command-line parameters
+	if(argc < 3) {
+		printf("\t%s input.tga output.fi [block_size=8 [nof_threads=4 [rms_error=256]]]\n\n"
+			"\t\tYou must specify input and output filenames\n", program_name);
+
+		RETURN_FROM_MAIN(0);
+	}
+
+	input_filename = argv[1];
+	output_filename = argv[2];
+	if(argc > 3) {
+		/* The block size is clamped to the range [2, IMAGE_MAXBLOCKSIZE]. */
+		blocksize = abs(atoi(argv[3]));
+		if(blocksize < 2)
+			blocksize = 2;
+		if(blocksize > IMAGE_MAXBLOCKSIZE)
+			blocksize = IMAGE_MAXBLOCKSIZE;
+
+		printf("Default block size changed to %u\n", blocksize);
+	} else
+		blocksize = 8;
+
+	if(argc > 4) {
+		nof_threads = abs(atoi(argv[4]));
+		if(nof_threads == 0) nof_threads = 1;
+	} else {
+#ifdef SINGLE_THREAD_BY_DEFAULT
+		nof_threads = 1;
+#else
+		nof_threads = 4;
+#endif
+	}
+	printf("Number of threads set to %u\n", nof_threads);
+
+	if(argc > 5)
+		rms_error = abs(atoi(argv[5]));
+	else
+		rms_error = 256;
+	/* NOTE(review): this product wraps around for large rms_error values; no clamping is applied here. */
+	worst_diff = rms_error*rms_error*blocksize*blocksize;
+	/* The values are unsigned, so they are printed with %u to avoid showing large values as negative. */
+	printf("RMS error %u, worst difference for first block size will be %u\n", rms_error, worst_diff);
+
+	// Read the source image
+	switch(tgaLoad(input_filename, &image_pc)) {
+		case TGALOAD_OKAY:
+			break;
+		case TGALOAD_CANTOPENFILE:
+			printf("Can\'t open %s\n", input_filename);
+			RETURN_FROM_MAIN(1);
+		case TGALOAD_DAMAGEDFILE:
+			printf("Can\'t open %s, file is damaged\n", input_filename);
+			RETURN_FROM_MAIN(1);
+		case TGALOAD_UNSUPPORTEDFILETYPE:
+			printf("Can\'t open %s, unsupported file type\n", input_filename);
+			RETURN_FROM_MAIN(1);
+		case TGALOAD_MEMORYALLOCERROR:
+			printf("Can\'t open %s, memory allocation error\n", input_filename);
+			RETURN_FROM_MAIN(1);
+		default:
+			printf("Undefined error while opening %s\n", input_filename);
+			RETURN_FROM_MAIN(1);
+	}
+
+	/* Both dimensions must be multiples of the block size and at least two blocks long. */
+	if((image_pc.w%blocksize) != 0 || (image_pc.h%blocksize) != 0 || image_pc.w < (blocksize*2) || image_pc.h < (blocksize*2)) {
+		printf("Image sizes must be multiples of %u and at least %u\n", blocksize, blocksize*2);
+		RETURN_FROM_MAIN(2);
+	}
+
+	if(!UnpackChannels(&image_pc, &image_uc)) {
+		printf("Can\'t unpack image channels\n");
+		RETURN_FROM_MAIN(3);
+	}
+
+	/* The packed pixel data is released once the channels have been unpacked. */
+	free(image_pc.data);
+
+	BGRtoYCBCR(&image_uc);
+
+	// Save the resulting image
+	switch(fiSave(output_filename, &image_uc, blocksize, nof_threads, worst_diff)) {
+		case FISAVE_OKAY:
+			break;
+		case FISAVE_CANTOPENFILE:
+			printf("Can\'t save %s\n", output_filename);
+			RETURN_FROM_MAIN(4);
+		case FISAVE_WRONGIMAGESIZE:
+			printf("Can\'t save %s, wrong image size\n", output_filename);
+			RETURN_FROM_MAIN(4);
+		case FISAVE_DAMAGEDIMAGESTRUCT:
+			printf("Can\'t save %s, internal error, damaged image structure\n", output_filename);
+			RETURN_FROM_MAIN(4);
+		default:
+			printf("Undefined error while saving %s\n", output_filename);
+			RETURN_FROM_MAIN(4);
+	}
+
+	for(i = 0; i < image_uc.nof_channels; i++)
+		free(image_uc.data[i]);
+	free(image_uc.data);
+
+#ifdef I_USE_MPI
+	printf("===\n\trank %d finished\n===\n", my_rank);
+
+	MPI_Barrier(MPI_COMM_WORLD);
+	MPI_Finalize();
+#endif
+
+	return 0;
+}
diff --git a/src/cpu_driven/fi_save_blocks_search_workers.c b/src/cpu_driven/fi_save_blocks_search_workers.c
new file mode 100644
index 0000000..7e13028
--- /dev/null
+++ b/src/cpu_driven/fi_save_blocks_search_workers.c
@@ -0,0 +1,204 @@
+
+#include <stdio.h>
+#include <string.h>
+
+#include "../image_misc.h"
+#include "../fi_save_blocks_search_workers.h"
+
+// Finds, for the range block at (rx, ry) of one channel, the domain block and
+// the parameters u (scale) and v (offset) that approximate it with the smallest
+// summed squared error, and stores the result in *block_p.
+//
+// rx, ry        - top-left corner of the range block inside data_p
+// block_p       - receives dx, dy, tr, u, v (and is_divided after a subdivision)
+// blocksize     - side of the range block in pixels
+// w, h          - size of the channel; data_p is indexed with row stride w
+// scaled_data_p - image the domain blocks are read from, indexed as (row)*w/2+col
+//                 (presumably the half-resolution copy of data_p - confirm against the caller)
+// data_p        - pixels of the channel
+// worst_diff    - largest summed squared error accepted before the block is split
+//
+// Every position (t, s) with t <= w/2-blocksize and s <= h/2-blocksize is tried
+// against 8 variants of the range block. u is quantized to 0..31 (units of 1/32)
+// and v to -63..63 (units of 255/63). If the best error still exceeds worst_diff
+// and blocksize is even and greater than 2, the four quadrants are searched
+// recursively with blocksize/2 and worst_diff/4.
+// NOTE(review): dx/dy/tr are written only when a domain block beats the flat
+// fallback, and is_divided is only ever set to true here - this presumably relies
+// on the caller zero-initializing the blocks; confirm.
+static void fiFindBestDomainBlock(unsigned int rx, unsigned int ry,
+	image_rangeblock_type *block_p, unsigned int blocksize,
+	unsigned int w, unsigned int h,
+	unsigned char *scaled_data_p,
+	unsigned char *data_p,
+	unsigned int worst_diff
+	)
+{
+	unsigned int best_diff;
+	long long range_sum, domain_sum, rd_sum; // Sum of the range block pixels, of the domain block pixels, and of range pixel times domain pixel
+	long long domain_disp;
+	unsigned int l, m, s, t, tr;
+	unsigned char rotated_data_p[8*IMAGE_MAXBLOCKSIZE*IMAGE_MAXBLOCKSIZE];
+
+	// Slot 0 of rotated_data_p: the range block copied row by row out of the image
+	for(l = 0; l < blocksize; l++) {
+		memcpy(rotated_data_p+blocksize*l, data_p+w*(ry+l)+rx, blocksize);
+	}
+	// Slots 1..7: the block from slot 0 passed through ApplyReversedTransformToRangeBlock with index l
+	for(l = 1; l < 8; l++) {
+		ApplyReversedTransformToRangeBlock(rotated_data_p, rotated_data_p+l*blocksize*blocksize, l, blocksize);
+	}
+
+	range_sum = 0;
+	for(l = 0; l < blocksize; l++)
+		for(m = 0; m < blocksize; m++) {
+			range_sum += rotated_data_p[l*blocksize+m];
+		}
+	block_p->u = 0; block_p->v = (short)( ((range_sum/(blocksize*blocksize))*63+128)/255 ); // Fallback values in case no domain block fits at all
+	best_diff = 0; // In principle any number > 255*255*blocksize*blocksize=4161600 (for 8) would do here, but we compute the actual difference for the v and u set above
+	for(l = 0; l < blocksize; l++) {
+		for(m = 0; m < blocksize; m++) {
+			int br = rotated_data_p[l*blocksize+m];
+			int mult;
+
+			// Error of the flat block: dequantized v (v*255/63) against the real pixel
+			mult = (block_p->v*255/63-br);
+			best_diff += mult*mult;
+		}
+	}
+	//printf("range_disp %d, range_sum %d\n", range_disp, range_sum);
+
+	for(s = 0; s <= h/2-blocksize; s++) {
+		for(t = 0; t <= w/2-blocksize; t++) {
+			domain_disp = domain_sum = 0;
+			for(l = 0; l < blocksize; l++) {
+				for(m = 0; m < blocksize; m++) {
+					int bd = scaled_data_p[(s+l)*w/2+t+m];
+
+					domain_sum += bd;
+					domain_disp += bd*bd;
+				}
+			}
+			// n*sum(bd^2) - sum(bd)^2 with n = blocksize^2; zero exactly when the domain block is flat
+			domain_disp = domain_disp*(int)(blocksize*blocksize)-domain_sum*domain_sum;
+
+			for(tr = 0; tr < 8; tr++) {
+				unsigned int cur_diff = 0;
+				int u_int, v_int;
+
+				if(domain_disp == 0) {
+					// Flat domain block: no scale can be fitted, use the quantized range mean as offset
+					u_int = 0;
+					v_int = (int)( ((range_sum/(blocksize*blocksize))*63+128)/255 );
+				} else {
+					rd_sum = 0;
+					for(l = 0; l < blocksize; l++) {
+						for(m = 0; m < blocksize; m++) {
+							int br = rotated_data_p[tr*blocksize*blocksize+l*blocksize+m];
+							int bd = scaled_data_p[(s+l)*w/2+t+m];
+
+							rd_sum += br*bd;
+						}
+					}
+
+					u_int = (int)( 32*((int)(blocksize*blocksize)*rd_sum-range_sum*domain_sum)/domain_disp ); // brightness in the interval [0, 1.0), i.e. from [0 to 32)
+					// av_u += u_int; nof_u++;
+					//if(u_int < 0) {/*printf("u = %f\n", (float)(u_int/51.0));*/u_int = -u_int;} if(u_int > 255) {/*printf("u = %f\n", (float)(u_int/51.0));*/u_int = 255;}
+					//if(u_int < 0 || u_int > 255) continue;
+					// Two independent clamps on one line: u_int ends up in 0..31
+					if(u_int < 0) u_int = 0; if(u_int > 31) u_int = 31;
+					v_int = (int)( (range_sum-domain_sum*u_int/32)/(int)(blocksize*blocksize) );
+					// An offset outside -255..255 is rejected: this variant is skipped
+					if(v_int < -255 || v_int > 255) continue;
+					// Quantize the offset to -63..63, rounding away from zero at the half step
+					if(v_int < 0)
+						v_int = (v_int*63-128)/255;
+					else
+						v_int = (v_int*63+128)/255;
+
+					//if(v_int < -255) v_int = -255; if(v_int > 255) v_int = 255;
+					//printf("resulted u %d v %d\n", u_int, v_int);
+				}
+
+				// Error measured with the quantized u and v
+				for(l = 0; l < blocksize; l++) {
+					for(m = 0; m < blocksize; m++) {
+						int br = rotated_data_p[tr*blocksize*blocksize+l*blocksize+m];
+						int bd = scaled_data_p[(s+l)*w/2+t+m];
+						int mult;
+
+						mult = (bd*u_int/32+v_int*255/63-br);
+						cur_diff += mult*mult;
+					}
+				}
+				//printf("%d < %d\n", cur_diff, best_diff);
+				if(cur_diff < best_diff) {
+					best_diff = cur_diff;
+					block_p->dx = t;
+					block_p->dy = s;
+					block_p->tr = tr;
+					block_p->u = u_int;
+					block_p->v = v_int;
+				}
+
+				// A flat domain block gives the same result for every tr; an exact match cannot be improved
+				if(domain_disp == 0 || best_diff == 0)
+					break;
+			}
+			/*if(t%10 == 0)
+				printf("- %d dbs ok\n", i*max_dby*max_dbx+s*max_dbx+t);*/
+			if(best_diff == 0)
+				break;
+		}
+
+		if(best_diff == 0)
+			break;
+	}
+
+	if(best_diff > worst_diff && blocksize > 2 && blocksize%2 == 0) {
+		printf("best_diff %d/%d (=rms^2*blocksize^2), divided block size %d to %d\n", best_diff, worst_diff, blocksize, blocksize/2);
+
+		fiFindBestDomainBlock(rx, ry, // Position of the new block
+			(image_rangeblock_type *)block_p->divided_into, // Pointer to the new block
+			blocksize/2, // Halve the block size
+			w, h, scaled_data_p, data_p,
+			worst_diff/4); // Divide the allowed difference by 4 (the new block is 4 times smaller)
+
+		fiFindBestDomainBlock(rx+blocksize/2, ry, // Position of the new block
+			(image_rangeblock_type *)block_p->divided_into+1, // Pointer to the new block
+			blocksize/2, // Halve the block size
+			w, h, scaled_data_p, data_p,
+			worst_diff/4); // Divide the allowed difference by 4 (the new block is 4 times smaller)
+
+		fiFindBestDomainBlock(rx, ry+blocksize/2, // Position of the new block
+			(image_rangeblock_type *)block_p->divided_into+2, // Pointer to the new block
+			blocksize/2, // Halve the block size
+			w, h, scaled_data_p, data_p,
+			worst_diff/4); // Divide the allowed difference by 4 (the new block is 4 times smaller)
+
+		fiFindBestDomainBlock(rx+blocksize/2, ry+blocksize/2, // Position of the new block
+			(image_rangeblock_type *)block_p->divided_into+3, // Pointer to the new block
+			blocksize/2, // Halve the block size
+			w, h, scaled_data_p, data_p,
+			worst_diff/4); // Divide the allowed difference by 4 (the new block is 4 times smaller)
+
+		block_p->is_divided = true;
+	}
+}
+
+// Thread entry point: searches the best domain block for every worker_step-th
+// range block of the image, starting with block number worker_start.
+// arg points to a fi_worker_arg_type describing the job. Always returns 0.
+#ifdef WIN32
+unsigned int __stdcall fiWorkerFunc(void *arg)
+#else
+void *fiWorkerFunc(void *arg)
+#endif
+{
+	fi_worker_arg_type *job = (fi_worker_arg_type *)arg;
+
+	// Local snapshot of the job description
+	unsigned char *scaled_planes = job->scaled_data;
+	unsigned char **planes = job->data;
+	image_rangeblock_type *first_block = job->block_p;
+	unsigned int per_plane = job->nof_blocks_per_channel;
+	unsigned int width = job->w;
+	unsigned int height = job->h;
+	unsigned int planes_count = job->nof_channels;
+	unsigned int plane_bytes = job->alloc_memory;
+	unsigned int side = job->blocksize;
+	unsigned int first = job->worker_start;
+	unsigned int stride = job->worker_step;
+	unsigned int max_err = job->worst_diff;
+
+	unsigned int total = per_plane*planes_count;
+	unsigned int idx;
+
+	for(idx = first; idx < total; idx += stride) {
+		// Block number -> channel and pixel position of the block inside that channel
+		unsigned int plane = idx / per_plane;
+		unsigned int in_plane = idx % per_plane;
+		unsigned int left = (in_plane % (width/side)) * side;
+		unsigned int top = (in_plane / (width/side)) * side;
+		// Number of blocks this worker has already finished
+		unsigned int done = (idx-first)/stride;
+
+		// Progress report on every 10th block of this worker
+		if(done%10 == 0)
+			printf("worker %d %d rbs (%f) ok\n", first+1, done,
+				(float)( 100.0 * (double)done / (double)((total-first-1)/stride+1) ));
+
+		// The scaled planes lie back to back, alloc_memory/4 bytes per channel
+		fiFindBestDomainBlock(left, top,
+			first_block+idx, side,
+			width, height,
+			scaled_planes+plane*plane_bytes/4,
+			planes[plane],
+			max_err);
+	}
+
+	return 0;
+}
diff --git a/src/decompress.c b/src/decompress.c
new file mode 100644
index 0000000..a6a1115
--- /dev/null
+++ b/src/decompress.c
@@ -0,0 +1,93 @@
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "fi_load.h"
+#include "image_misc.h"
+#include "tga_save.h"
+
+// Entry point of the decompressor: <program> input.fi output.tga [scale=1].
+// Exit codes: 0 on success or when only the usage text is printed, 1 when the
+// input cannot be loaded, 3 when the channels cannot be packed, 4 when the
+// output cannot be saved.
+int main(int argc, char **argv)
+{
+	char *input_filename, *output_filename;
+	image_pc_type image_pc;
+	image_uc_type image_uc;
+	unsigned int i, scale;
+	char *program_name;
+	int packed_ok, save_result;
+
+	// Strip the directory part of argv[0]; a backslash separator is looked for first, then a slash
+	program_name = strrchr(argv[0], '\\');
+
+	if(program_name == 0)
+		program_name = strrchr(argv[0], '/');
+
+	if(program_name == 0)
+		program_name = argv[0];
+	else
+		program_name++;
+
+	if(argc < 3) {
+		printf("\t%s input.fi output.tga [scale=1]\n\n"
+			"\t\tYou must specify input and output filenames\n", program_name);
+
+		return 0;
+	}
+	input_filename = argv[1];
+	output_filename = argv[2];
+	if(argc > 3) {
+		// Negative values are mirrored, zero and unparsable input fall back to 1
+		scale = abs(atoi(argv[3]));
+		if(scale < 1)
+			scale = 1;
+
+		printf("Scale changed to %u\n", scale);
+	} else
+		scale = 1;
+
+	switch(fiLoad(input_filename, &image_uc, scale)) {
+		case FILOAD_OKAY:
+			break;
+		case FILOAD_CANTOPENFILE:
+			printf("Can\'t open %s\n", input_filename);
+			return 1;
+		case FILOAD_DAMAGEDFILE:
+			printf("Can\'t open %s, file is damaged\n", input_filename);
+			return 1;
+		case FILOAD_MEMORYALLOCERROR:
+			printf("Can\'t open %s, memory allocation error\n", input_filename);
+			return 1;
+		default:
+			printf("Undefined error while opening %s\n", input_filename);
+			return 1;
+	}
+
+	YCBCRtoBGR(&image_uc);
+
+	packed_ok = PackChannels(&image_uc, &image_pc) ? 1 : 0;
+
+	// The separate channel planes are no longer needed, whether packing worked or not
+	for(i = 0; i < image_uc.nof_channels; i++)
+		free(image_uc.data[i]);
+	free(image_uc.data);
+
+	if(!packed_ok) {
+		// This step packs the channels; the message used to say "unpack"
+		printf("Can\'t pack image channels\n");
+		return 3;
+	}
+
+	save_result = tgaSave(output_filename, &image_pc);
+
+	// Release the packed image before reporting, so that the error paths do not leak it
+	free(image_pc.data);
+
+	switch(save_result) {
+		case TGASAVE_OKAY:
+			break;
+		case TGASAVE_CANTOPENFILE:
+			printf("Can\'t save %s\n", output_filename);
+			return 4;
+		case TGASAVE_DAMAGEDIMAGESTRUCT:
+			printf("Can\'t save %s, internal error, damaged image structure\n", output_filename);
+			return 4;
+		case TGASAVE_TOOMANYCHANNELS:
+			printf("Can\'t save %s, too many channels in input file\n", output_filename);
+			return 4;
+		default:
+			printf("Undefined error while saving %s\n", output_filename);
+			return 4;
+	}
+
+	return 0;
+}
diff --git a/src/fi_file.h b/src/fi_file.h
new file mode 100644
index 0000000..8faa637
--- /dev/null
+++ b/src/fi_file.h
@@ -0,0 +1,15 @@
+
+#ifndef _FI_FILE_H
+#define _FI_FILE_H
+
+#define FI_SIGN 0xC0F1
+
+// Header stored at the very beginning of a .fi file. It is read and written as
+// a raw copy of this struct (sizeof(FI_HEADER) bytes), so the on-disk layout
+// follows the compiler's layout of the struct.
+typedef struct {
+	unsigned short sign; // Signature; a valid file carries FI_SIGN here
+	unsigned short blocksize; // Block size. if(blocksize == 0) blocksize = 8;
+	                          // NOTE(review): fiLoad rejects blocksize < 2, so the "0 means 8" rule above does not seem to be implemented - confirm
+	unsigned int w; // Image width in pixels
+	unsigned int h; // Image height in pixels
+	unsigned int noc; // Number of channels
+} FI_HEADER;
+
+#endif
diff --git a/src/fi_load.c b/src/fi_load.c
new file mode 100644
index 0000000..f67f0be
--- /dev/null
+++ b/src/fi_load.c
@@ -0,0 +1,469 @@
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "fi_file.h"
+#include "fi_load.h"
+#include "image_misc.h"
+#include "arrays.h"
+
+// Returns the length of the open file f in bytes and leaves the read/write
+// position where it was. Returns 0 when the position cannot be queried or
+// changed (ftell reports failure as -1, which must not be converted to size_t).
+static size_t GetFileLength(FILE *f)
+{
+	long saved_pos, end_pos;
+
+	saved_pos = ftell(f);
+	if(saved_pos < 0)
+		return 0;
+
+	if(fseek(f, 0, SEEK_END) != 0)
+		return 0;
+	end_pos = ftell(f);
+
+	// Always try to put the position back, even if the length query failed
+	if(fseek(f, saved_pos, SEEK_SET) != 0 || end_pos < 0)
+		return 0;
+
+	return (size_t)end_pos;
+}
+
+static bool fiDecompressBlocks(unsigned char *cb, unsigned int cb_size, image_rangeblock_type *b, unsigned int w, unsigned int h, unsigned int blocksize, unsigned int channels, unsigned int scale, unsigned int dxy_bitssize);
+static unsigned int fiCoverRangeBlock(image_rangeblock_type *block_p, unsigned int blocksize, unsigned char *range_p, unsigned char *prev_data, unsigned int w, unsigned int h);
+
+// Loads the .fi file fname and decodes it into *image, enlarged scale times.
+//
+// fname - path of the file to read
+// image - receives w, h, nof_channels, alloc_memory and one malloc'ed plane per
+//         channel in image->data[]; on success the caller owns and frees them
+// scale - integer magnification factor; values below 1 are treated as 1
+//
+// Returns FILOAD_OKAY, FILOAD_CANTOPENFILE, FILOAD_DAMAGEDFILE or
+// FILOAD_MEMORYALLOCERROR. On every error return the file is closed and all
+// memory allocated here has already been freed.
+//
+// NOTE(review): the sizes below are computed in unsigned int from header fields
+// taken from the file; very large w/h/noc/scale values can wrap around - confirm
+// whether the input has to be treated as untrusted.
+int fiLoad(char *fname, image_uc_type *image, unsigned int scale)
+{
+	FILE *f;
+	FI_HEADER head;
+	size_t file_len;
+	unsigned int i, j, k, channels, alloc_memory, nof_blocks, blocksize, compressed_blocks_size;
+	unsigned int compressed_blocks_capacity; // Number of bytes allocated for compressed_blocks
+	unsigned char *prev_data; // Image data of the previous iteration
+	unsigned char *compressed_blocks;
+	unsigned int dxy_bitssize, cblock_bitssize; // Sizes of dx, dy and of a whole block, in bits
+	image_rangeblock_type *blocks;
+
+	if(scale < 1)
+		scale = 1;
+
+	f = fopen(fname, "rb");
+
+	if(!f)
+		return FILOAD_CANTOPENFILE;
+
+	file_len = GetFileLength(f);
+
+	// From here on every rejection has to close the file before returning
+	if(fread(&head, sizeof(FI_HEADER), 1, f) != 1) {
+		fclose(f);
+		return FILOAD_DAMAGEDFILE;
+	}
+
+	if(head.sign != FI_SIGN || head.blocksize < 2) {
+		fclose(f);
+		return FILOAD_DAMAGEDFILE;
+	}
+
+	blocksize = head.blocksize;
+
+	if((head.w%blocksize) != 0 || (head.h%blocksize) != 0 || head.w < (blocksize*2) || head.h < (blocksize*2)) {
+		fclose(f);
+		return FILOAD_DAMAGEDFILE;
+	}
+
+	GetDxyBitsSize(head.w, head.h, &dxy_bitssize);
+	cblock_bitssize = GetCblockBitsSize(dxy_bitssize);
+
+	image->w = head.w*scale;
+	image->h = head.h*scale;
+	image->nof_channels = channels = head.noc;
+	image->alloc_memory = alloc_memory = image->w*image->h;
+	blocksize *= scale;
+	nof_blocks = image->w*image->h*image->nof_channels/(blocksize*blocksize);
+
+	// Memory allocation starts here
+	// Sized on the assumption that all blocks are 2x2 and take cblock_bitssize bits each,
+	// plus the additional block subdivision bits.
+	// Every block can be split into 4, so a block of size n (n a power of two) has
+	// s = 1+4+16+...+pow(4, log2(n)-1) subdivision flags,
+	// i.e. s = (1-pow(4,log2(n)))/(1-4) = (n*n-1)/3.
+	// For n that is not a power of two (n = pow(2,x)*y) the same sum gives
+	// s = ((n/y)*(n/y)-1)/3, which is less than (n*n-1)/3,
+	// so it is enough to consider the power of two case.
+	// Here that makes nof_blocks*(blocksize*blocksize-1)/3 bits,
+	// + 1 byte in case the number of bits is not a multiple of 8.
+	compressed_blocks_capacity = (cblock_bitssize*image->w*image->h*image->nof_channels/4+nof_blocks*(blocksize*blocksize-1)/3)/8+1;
+	compressed_blocks = malloc(compressed_blocks_capacity);
+	printf("compressed_blocks = malloc(%u)\n", compressed_blocks_capacity);
+	if(!compressed_blocks) {
+		fclose(f);
+		return FILOAD_MEMORYALLOCERROR;
+	}
+
+	// A bit of code for reading the blocks from the file
+	compressed_blocks_size = file_len-sizeof(FI_HEADER);
+	printf("compressed_blocks_size %u\n", compressed_blocks_size);
+	// The payload length comes from the file, the buffer size from the header:
+	// a payload that does not fit must never be read into the buffer. The
+	// comparison is done in size_t, before the length is narrowed to unsigned int.
+	if(file_len-sizeof(FI_HEADER) > compressed_blocks_capacity) {
+		free(compressed_blocks);
+		fclose(f);
+
+		return FILOAD_DAMAGEDFILE;
+	}
+	if(fread(compressed_blocks, 1, compressed_blocks_size, f) != compressed_blocks_size) {
+		free(compressed_blocks);
+		fclose(f);
+
+		return FILOAD_DAMAGEDFILE;
+	}
+	fclose(f);
+
+	// And memory allocation again
+	blocks = malloc(sizeof(image_rangeblock_type)*nof_blocks*(blocksize*blocksize-1)/3); // (blocksize*blocksize-1)/3 - number of subdivisions of one block in the worst case
+	printf("blocks = malloc(%u)\n", (unsigned int)(sizeof(image_rangeblock_type)*nof_blocks*(blocksize*blocksize-1)/3));
+	if(!blocks) {
+		free(compressed_blocks);
+		return FILOAD_MEMORYALLOCERROR;
+	}
+	SetBlocksPointers(blocks, blocksize, nof_blocks, image->w, image->h);
+
+	prev_data = malloc(alloc_memory/4);
+	printf("prev_data = malloc(%u)\n", alloc_memory/4);
+	if(!prev_data) {
+		free(compressed_blocks);
+		free(blocks);
+		return FILOAD_MEMORYALLOCERROR;
+	}
+
+	image->data = malloc(channels*sizeof(void *));
+	printf("image->data = malloc(%u)\n", (unsigned int)(channels*sizeof(void *)));
+	if(!image->data) {
+		free(compressed_blocks);
+		free(blocks);
+		free(prev_data);
+		return FILOAD_MEMORYALLOCERROR;
+	}
+
+	for(i = 0; i < channels; i++) {
+		image->data[i] = malloc(alloc_memory);
+		printf("image->data[%u] = malloc(%u)\n",  i, alloc_memory);
+		if(!image->data[i]) {
+			// Undo the planes allocated so far
+			for(j = 0; j < i; j++)
+				free(image->data[j]);
+			free(image->data);
+			free(compressed_blocks);
+			free(blocks);
+			free(prev_data);
+
+			return FILOAD_MEMORYALLOCERROR;
+		}
+		// Decoding starts from an all-zero plane
+		memset(image->data[i], 0, alloc_memory);
+	}
+	// And here it ends
+
+	if(!fiDecompressBlocks(compressed_blocks, compressed_blocks_size, blocks, head.w, head.h, head.blocksize, head.noc, scale, dxy_bitssize)) {
+		free(compressed_blocks);
+		free(blocks);
+		free(prev_data);
+		for(i = 0; i < channels; i++)
+			free(image->data[i]);
+		free(image->data);
+
+		return FILOAD_DAMAGEDFILE;
+	}
+	free(compressed_blocks);
+
+	for(i = 0; i < channels; i++) {
+		unsigned char *data_p;
+		unsigned int nof_iterations = 0;
+
+		data_p = image->data[i];
+
+		// Repeat the block transforms until a pass changes nothing any more
+		// (mean squared change per pixel is 0) or 1024 passes have been made
+		while(1) {
+			unsigned long long diff = 0;
+			image_rangeblock_type *block_p;
+
+			block_p = blocks+i*nof_blocks/image->nof_channels;
+			// prev_data receives the current plane via Scale2to1; the domain blocks are read from it
+			Scale2to1(data_p, prev_data, image->w, image->h);
+			for(j = 0; j < image->h/blocksize; j++) {
+				for(k = 0; k < image->w/blocksize; k++) {
+					unsigned char *range_p;
+
+					range_p = &data_p[j*blocksize*image->w+k*blocksize];
+
+					diff += fiCoverRangeBlock(block_p, blocksize, range_p, prev_data, image->w, image->h);
+
+					block_p++;
+				}
+			}
+
+			diff /= image->w*image->h;
+			printf("diff %llu\n", diff);
+			nof_iterations++;
+			if(nof_iterations >= 1024 || diff == 0)
+				break;
+		}
+		printf("\nchannel %d completed after %d iterations\n", i+1, nof_iterations);
+	}
+
+	free(blocks);
+	free(prev_data);
+
+	return FILOAD_OKAY;
+}
+
+// Applies one decoded block to the image: every pixel of the range block is
+// overwritten with domain*u/32+v (clamped to 0..255), the domain pixel being
+// taken from prev_data at (dx, dy) under the transformation block_p->tr.
+// A divided block is handled by recursing into its four quadrants.
+//
+// block_p   - decoded block description
+// blocksize - side of the range block in pixels
+// range_p   - top-left pixel of the range block inside the image (row stride w)
+// prev_data - image the domain blocks are read from
+// w, h      - size of the image that range_p points into (h is not used)
+//
+// Returns the sum of squared differences between the old and the new pixel
+// values. An unknown transformation leaves the block untouched and yields 0.
+static unsigned int fiCoverRangeBlock(image_rangeblock_type *block_p, unsigned int blocksize, unsigned char *range_p, unsigned char *prev_data, unsigned int w, unsigned int h)
+{
+	unsigned int row, col;
+	unsigned int err = 0;
+	unsigned char *domain_p;
+	// How a range pixel (row, col) is mapped onto the domain block
+	int identity = 0, swap_axes = 0, flip_y = 0, flip_x = 0;
+
+	if(block_p->is_divided) {
+		image_rangeblock_type *quads = (image_rangeblock_type *)block_p->divided_into;
+		unsigned int half = blocksize/2;
+
+		// Quadrant order: top-left, top-right, bottom-left, bottom-right
+		err += fiCoverRangeBlock(quads, half, range_p, prev_data, w, h);
+		err += fiCoverRangeBlock(quads+1, half, range_p+blocksize/2, prev_data, w, h);
+		err += fiCoverRangeBlock(quads+2, half, range_p+w*blocksize/2, prev_data, w, h);
+		err += fiCoverRangeBlock(quads+3, half, range_p+(w+1)*blocksize/2, prev_data, w, h);
+
+		return err;
+	}
+
+	switch(block_p->tr) {
+		case IMAGE_RB_TRANSFORMATION_NONE:
+			identity = 1;
+			break;
+		case IMAGE_RB_TRANSFORMATION_90RIGHT:
+			swap_axes = 1; flip_y = 1;
+			break;
+		case IMAGE_RB_TRANSFORMATION_180RIGHT:
+			flip_y = 1; flip_x = 1;
+			break;
+		case IMAGE_RB_TRANSFORMATION_270RIGHT:
+			swap_axes = 1; flip_x = 1;
+			break;
+		case IMAGE_RB_TRANSFORMATION_VFLIP:
+			flip_y = 1;
+			break;
+		case IMAGE_RB_TRANSFORMATION_HFLIP:
+			flip_x = 1;
+			break;
+		case IMAGE_RB_TRANSFORMATION_MAINDIAGFLIP:
+			swap_axes = 1;
+			break;
+		case IMAGE_RB_TRANSFORMATION_ANTIDIAGFLIP:
+			swap_axes = 1; flip_y = 1; flip_x = 1;
+			break;
+		default:
+			// Unknown transformation: nothing is written
+			return err;
+	}
+
+	domain_p = &prev_data[block_p->dy*w/2+block_p->dx];
+
+	for(row = 0; row < blocksize; row++) {
+		for(col = 0; col < blocksize; col++) {
+			// Domain coordinates of the source pixel
+			unsigned int sy = swap_axes ? col : row;
+			unsigned int sx = swap_axes ? row : col;
+			unsigned int offset;
+			int predicted, delta;
+
+			if(flip_y)
+				sy = blocksize-1-sy;
+			if(flip_x)
+				sx = blocksize-1-sx;
+
+			// NOTE(review): the untransformed case steps domain rows by w/2, all
+			// other cases use sy*w/2 (= (sy*w)/2); the two agree for even w only.
+			// Kept exactly as it was - confirm whether odd widths can occur.
+			if(identity)
+				offset = sy*(w/2)+sx;
+			else
+				offset = sy*w/2+sx;
+
+			predicted = (int)(domain_p[offset])*(int)block_p->u/32+(int)block_p->v;
+			if(predicted < 0)
+				predicted = 0;
+			if(predicted > 255)
+				predicted = 255;
+
+			delta = (int)(*range_p)-predicted;
+			err += delta*delta;
+
+			*range_p = predicted;
+			range_p++;
+		}
+
+		// Jump to the first pixel of the next row of the range block
+		range_p += w-blocksize;
+	}
+
+	return err;
+}
+
+// Reads one block (and, if it is divided, its four sub-blocks) from the bit
+// stream cbarray into *bp.
+//
+// blocksize    - side of the block being read, in pixels of the stored image
+// w, h         - size of the stored image
+// scale        - factor the decoded dx and dy are multiplied by
+// dxy_bitssize - number of bits the combined dx/dy value takes in the stream
+//
+// Returns false when the stream ends early or a domain position lies outside
+// the allowed range, true otherwise.
+static bool fiDecompressOneBlock(array_type *cbarray, image_rangeblock_type *bp, unsigned int blocksize, unsigned int w, unsigned int h, unsigned int scale, unsigned int dxy_bitssize)
+{
+	unsigned int split_flag = 0;
+	unsigned int scale_bits, offset_bits, position_bits, transform_bits;
+	unsigned int child;
+
+	// Only even blocks larger than 3 carry a subdivision flag in the stream
+	if(blocksize%2 == 0 && blocksize > 3) {
+		if(!arrayReadBits(cbarray, 1, &split_flag))
+			return false;
+	}
+
+	if(split_flag) {
+		image_rangeblock_type *children = (image_rangeblock_type *)bp->divided_into;
+
+		for(child = 0; child < 4; child++) {
+			if(!fiDecompressOneBlock(cbarray, children+child, blocksize/2, w, h, scale, dxy_bitssize))
+				return false;
+		}
+
+		bp->is_divided = true;
+
+		return true;
+	}
+
+	// Leaf block: 5 bits of u come first
+	if(!arrayReadBits(cbarray, 5, &scale_bits))
+		return false;
+	bp->u = scale_bits;
+
+	if(bp->u > 0) {
+		// 7 bits of v: bit 6 is the sign, bits 0..5 the magnitude in units of 255/63
+		if(!arrayReadBits(cbarray, 7, &offset_bits))
+			return false;
+		bp->v = (offset_bits & 64) ? -1 : 1;
+		bp->v *= (int)offset_bits&63;
+		bp->v = (bp->v*255)/63;
+
+		// dx and dy are stored as one number: dy*(w/2-1)+dx
+		if(!arrayReadBits(cbarray, dxy_bitssize, &position_bits))
+			return false;
+
+		bp->dx = position_bits%(w/2-1);
+		if(bp->dx > w/2-blocksize)
+			return false;
+		bp->dx *= scale;
+
+		bp->dy = position_bits/(w/2-1);
+		if(bp->dy > h/2-blocksize)
+			return false;
+		bp->dy *= scale;
+
+		if(!arrayReadBits(cbarray, 3, &transform_bits))
+			return false;
+		bp->tr = transform_bits;
+	} else {
+		// u == 0: only a 6 bit v follows, there is no domain block
+		if(!arrayReadBits(cbarray, 6, &offset_bits))
+			return false;
+		bp->v = offset_bits&255;
+		bp->v = (bp->v*255)/63;
+
+		bp->dx = 0;
+		bp->dy = 0;
+		bp->tr = 0;
+	}
+
+	bp->is_divided = false;
+
+	return true;
+}
+
+// Decodes all top-level blocks of all channels from the cb_size bytes at cb
+// into the array b, in channel, row, column order.
+// Returns false as soon as one block cannot be decoded, true otherwise.
+static bool fiDecompressBlocks(unsigned char *cb, unsigned int cb_size, image_rangeblock_type *b, unsigned int w, unsigned int h, unsigned int blocksize, unsigned int channels, unsigned int scale, unsigned int dxy_bitssize)
+{
+	array_type reader;
+	image_rangeblock_type *cursor = b;
+	unsigned int plane, block_row, block_col;
+
+	// Bit reader positioned at the first bit of the compressed data
+	reader.bytes = cb;
+	reader.bp = reader.bytes;
+	reader.bits_offset = 0;
+	reader.bytes_offset = 0;
+	reader.bytes_size = cb_size;
+
+	for(plane = 0; plane < channels; plane++) {
+		for(block_row = 0; block_row < h/blocksize; block_row++) {
+			for(block_col = 0; block_col < w/blocksize; block_col++, cursor++) {
+				if(!fiDecompressOneBlock(&reader, cursor, blocksize, w, h, scale, dxy_bitssize))
+					return false;
+			}
+		}
+	}
+
+	return true;
+}
diff --git a/src/fi_load.h b/src/fi_load.h
new file mode 100644
index 0000000..f1983f9
--- /dev/null
+++ b/src/fi_load.h
@@ -0,0 +1,14 @@
+
+#ifndef _FI_LOAD_H
+#define _FI_LOAD_H
+
+#include "image.h"
+
+// Result codes of fiLoad
+#define FILOAD_OKAY 0 // The image was loaded and decoded
+#define FILOAD_CANTOPENFILE 1 // fopen failed for the given file name
+#define FILOAD_DAMAGEDFILE 2 // Bad header (signature, block size, dimensions) or undecodable block data
+#define FILOAD_MEMORYALLOCERROR 3 // One of the working buffers could not be allocated
+
+// Loads the .fi file fname into *image, enlarged scale times (scale < 1 is treated as 1).
+// Returns one of the FILOAD_* codes above.
+extern int fiLoad(char *fname, image_uc_type *image, unsigned int scale);
+
+#endif
diff --git a/src/fi_save.c b/src/fi_save.c
new file mode 100644
index 0000000..7d564c6
--- /dev/null
+++ b/src/fi_save.c
@@ -0,0 +1,134 @@
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#ifdef I_USE_MPI
+#include <mpi.h>
+#endif
+
+#include "arrays.h"
+#include "fi_file.h"
+#include "fi_save.h"
+#include "fi_save_blocks_compression.h"
+#include "fi_save_blocks_search.h"
+#include "image_misc.h"
+
+static int fiConvertImage(image_uc_type *image, fi_compresseddata_type *cdat, unsigned int blocksize, unsigned int nof_threads, unsigned int worst_diff);
+static int fiSaveCompressedToFile(char *fname, fi_compresseddata_type *cdat);
+
+// Compresses image and writes the result to the file fname.
+//
+// fname       - path of the output file
+// image       - image to compress
+// blocksize, nof_threads, worst_diff - passed through to the compression step
+//
+// Returns FISAVE_OKAY on success, otherwise the FISAVE_* code of the step that
+// failed. When built with I_USE_MPI only rank 0 writes the file; the other
+// ranks return FISAVE_OKAY after the compression step.
+int fiSave(char *fname, image_uc_type *image, unsigned int blocksize, unsigned int nof_threads, unsigned int worst_diff)
+{
+	int result;
+	fi_compresseddata_type cdat;
+	clock_t compression_time;
+#ifdef I_USE_MPI
+	int my_rank;
+#endif
+
+	compression_time = clock();
+	result = fiConvertImage(image, &cdat, blocksize, nof_threads, worst_diff);
+	compression_time = clock()-compression_time;
+	// clock() counts in units of 1/CLOCKS_PER_SEC seconds, which is not always a
+	// millisecond, so the tick count is converted before it is printed as "ms".
+	// Note that clock() measures processor time used by the program, not wall-clock time.
+	printf("compression time %d ms\n", (int)((double)compression_time*1000.0/CLOCKS_PER_SEC));
+
+	if(result != FISAVE_OKAY)
+		return result;
+
+#ifdef I_USE_MPI
+	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+
+	if(my_rank == 0)
+		return fiSaveCompressedToFile(fname, &cdat);
+	else
+		return FISAVE_OKAY;
+#else
+	return fiSaveCompressedToFile(fname, &cdat);
+#endif
+}
+
+// Runs the domain block search on image and packs the found blocks, preceded by
+// a FI_HEADER, into *cdat.
+//
+// Returns FISAVE_OKAY or the error code of the step that failed. The block
+// array produced by the search is freed before returning, whatever the result.
+// With I_USE_MPI and more than one process, the block arrays of all ranks are
+// combined on rank 0 with a bytewise OR (presumably every rank fills in only
+// its own share of the blocks and leaves the rest zeroed - confirm); ranks
+// other than 0 then return with cdat->len = 0 and do not produce any data.
+static int fiConvertImage(image_uc_type *image, fi_compresseddata_type *cdat, unsigned int blocksize, unsigned int nof_threads, unsigned int worst_diff)
+{
+	FI_HEADER head;
+	unsigned int result;
+	fi_data_type data;
+#ifdef I_USE_MPI
+	int my_rank, nof_procs;
+#endif
+
+	result = fiFindBestDomainBlocks(image, blocksize, nof_threads, worst_diff, &data);
+	if(result != FISAVE_OKAY)
+		return result;
+
+#ifdef I_USE_MPI
+	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &nof_procs);
+	if(nof_procs > 1) {
+		image_rangeblock_type *resulted_blocks = 0;
+
+		if(my_rank == 0) {
+			resulted_blocks = malloc(sizeof(image_rangeblock_type)*data.nof_blocks*(blocksize*blocksize-1)/3); // Why this size - see fi_save_blocks_search.c
+
+			if(!resulted_blocks) {
+				result = FISAVE_MEMORYALLOCERROR;
+				// NOTE(review): this jump skips MPI_Barrier and MPI_Reduce on rank 0 only;
+				// the other ranks still enter them and look like they would wait forever - confirm
+				goto EXIT_STATE;
+			}
+		}
+
+		MPI_Barrier(MPI_COMM_WORLD);
+
+		// Merging of the blocks. Why this size - see fi_save_blocks_search.c
+		MPI_Reduce(data.blocks, resulted_blocks, sizeof(image_rangeblock_type)*data.nof_blocks*(blocksize*blocksize-1)/3, MPI_BYTE, MPI_BOR, 0, MPI_COMM_WORLD);
+
+		if(my_rank == 0) {
+			free(data.blocks);
+			data.blocks = resulted_blocks;
+
+			// Recompute the pointers, since after the MPI_Reduce call ... (the original comment
+			// breaks off here; presumably the pointers stored inside the merged blocks do not
+			// refer to the new buffer - confirm)
+			SetBlocksPointers(data.blocks, blocksize, data.nof_blocks, image->w, image->h);
+		}
+	}
+
+	if(my_rank > 0) {
+		cdat->len = 0;
+		result = FISAVE_OKAY;
+
+		goto EXIT_STATE;
+	}
+#endif
+
+	// Compress the found blocks for writing to the file
+	head.sign = FI_SIGN;
+	head.blocksize = blocksize;
+	head.noc = image->nof_channels;
+	head.w = image->w;
+	head.h = image->h;
+	result = fiCompressBlocksAndAddHeader(&head ,cdat, data.blocks, data.nof_blocks);
+
+#ifdef I_USE_MPI
+EXIT_STATE:
+#endif
+
+	free(data.blocks);
+
+	return result;
+}
+
+// Writes the cdat->len bytes at cdat->data to the file fname and releases the
+// buffer. The buffer is freed and cdat is reset whether or not writing worked.
+//
+// Returns FISAVE_OKAY when the whole buffer reached the file, and
+// FISAVE_CANTOPENFILE when the file cannot be opened, written completely or
+// closed (there is no separate code for a failed write).
+static int fiSaveCompressedToFile(char *fname, fi_compresseddata_type *cdat)
+{
+	FILE *f;
+	int status = FISAVE_OKAY;
+
+	f = fopen(fname, "wb");
+	if(!f) {
+		status = FISAVE_CANTOPENFILE;
+	} else {
+		// A short write (disk full, I/O error) must not be reported as success
+		if(fwrite(cdat->data, 1, cdat->len, f) != cdat->len)
+			status = FISAVE_CANTOPENFILE;
+
+		// Buffered data is flushed by fclose, so its result matters as well
+		if(fclose(f) != 0)
+			status = FISAVE_CANTOPENFILE;
+	}
+
+	free(cdat->data);
+	cdat->data = 0;
+	cdat->len = 0;
+
+	return status;
+}
diff --git a/src/fi_save.h b/src/fi_save.h
new file mode 100644
index 0000000..0f44d51
--- /dev/null
+++ b/src/fi_save.h
@@ -0,0 +1,15 @@
+
+#ifndef _FI_SAVE_H
+#define _FI_SAVE_H
+
+#include "image.h"
+
+// Result codes of fiSave
+#define FISAVE_OKAY 0 // The image was compressed and written
+#define FISAVE_CANTOPENFILE 1 // The output file could not be opened
+#define FISAVE_WRONGIMAGESIZE 2 // Block size out of range, or image dimensions not a multiple of it / smaller than two blocks
+#define FISAVE_DAMAGEDIMAGESTRUCT 3 // w*h of the image does not match its alloc_memory
+#define FISAVE_MEMORYALLOCERROR 4 // A buffer could not be allocated (also returned when packing a block into the output buffer fails)
+
+// Compresses image and writes it to the file fname. Returns one of the FISAVE_* codes above.
+extern int fiSave(char *fname, image_uc_type *image, unsigned int blocksize, unsigned int nof_threads, unsigned int worst_diff);
+
+#endif
diff --git a/src/fi_save_blocks_compression.c b/src/fi_save_blocks_compression.c
new file mode 100644
index 0000000..3589a0f
--- /dev/null
+++ b/src/fi_save_blocks_compression.c
@@ -0,0 +1,111 @@
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "arrays.h"
+#include "fi_file.h"
+#include "fi_save.h"
+#include "image_misc.h"
+
+#include "fi_save_blocks_compression.h"
+
+// Writes one block (recursively, if it is divided) into the bit stream; defined below
+static bool fiCompressOneBlock(array_type *cbarray, image_rangeblock_type *b, unsigned int blocksize, unsigned int w, unsigned int h, unsigned int dxy_bitssize);
+
+// Counters for the statistics printout: number of divided blocks and number of
+// leaf blocks written. Reset at the start of every fiCompressBlocksAndAddHeader call.
+static unsigned int g_statistics_nofdivisions, g_statistics_nofblocks;
+
+// Builds the complete file image in memory: a copy of *head followed by the
+// bit-packed blocks.
+//
+// head       - header to store; its w, h, noc and blocksize also size the buffer
+// cdat       - receives the malloc'ed buffer (data) and the number of used bytes (len)
+// blocks     - top-level blocks to pack
+// nof_blocks - number of top-level blocks
+//
+// Returns FISAVE_OKAY, or FISAVE_MEMORYALLOCERROR when the buffer cannot be
+// allocated or a block cannot be written into it (the buffer is freed then).
+int fiCompressBlocksAndAddHeader(FI_HEADER *head ,fi_compresseddata_type *cdat, image_rangeblock_type *blocks, unsigned int nof_blocks)
+{
+	array_type writer;
+	unsigned int position_bits, block_bits; // Sizes of dx, dy and of a whole block, in bits
+	image_rangeblock_type *cur;
+
+	GetDxyBitsSize(head->w, head->h, &position_bits);
+	block_bits = GetCblockBitsSize(position_bits);
+
+	// Sized on the assumption that all blocks are 2x2 and take block_bits bits each,
+	// plus the additional block subdivision bits (see fi_save_blocks_search.c),
+	// plus 1 byte in case the number of bits is not a multiple of 8
+	writer.bits_offset = 0;
+	writer.bytes_offset = 0;
+	writer.bytes_size = (block_bits*head->w*head->h*head->noc/4+nof_blocks*(head->blocksize*head->blocksize-1)/3)/8+1;
+
+	cdat->data = malloc(sizeof(FI_HEADER)+writer.bytes_size);
+	if(!cdat->data)
+		return FISAVE_MEMORYALLOCERROR;
+
+	// The header goes first, the zeroed bit stream area follows right after it
+	memcpy(cdat->data, head, sizeof(FI_HEADER));
+	writer.bytes = cdat->data+sizeof(FI_HEADER);
+	writer.bp = writer.bytes;
+	memset(writer.bytes, 0, writer.bytes_size);
+
+	g_statistics_nofblocks = 0;
+	g_statistics_nofdivisions = 0;
+
+	for(cur = blocks; cur != blocks+nof_blocks; cur++) {
+		if(!fiCompressOneBlock(&writer, cur, head->blocksize, head->w, head->h, position_bits)) {
+			free(cdat->data);
+			return FISAVE_MEMORYALLOCERROR;
+		}
+	}
+
+	// Whole bytes written so far, plus one more for a partly filled last byte
+	cdat->len = sizeof(FI_HEADER)+writer.bytes_offset;
+	if(writer.bits_offset > 0)
+		cdat->len++;
+
+	printf("statistics:\n\t%d blocks ~ %d bytes\n", g_statistics_nofblocks, (g_statistics_nofblocks*(block_bits+1)+7)/8);
+	printf("\t%d divisions - %d bytes\n", g_statistics_nofdivisions, (g_statistics_nofdivisions+7)/8);
+	printf("\ttotal ~ %d bytes\n", (g_statistics_nofblocks*(block_bits+1)+g_statistics_nofdivisions+7)/8);
+	printf("\tcbarray bytes_offset - %d bits_offset - %d\n", writer.bytes_offset, writer.bits_offset);
+	printf("\tcdat array size - %d bytes\n", (int)cdat->len);
+
+	return FISAVE_OKAY;
+}
+
+// Writes the block *b into the bit stream cbarray. A divided block is written
+// as a 1 bit followed by its four sub-blocks; a leaf block as an optional 0 bit
+// (only for even block sizes above 3), 5 bits of u and then either
+// 7 bits of v + dxy_bitssize bits of position + 3 bits of tr (u > 0)
+// or 6 bits of v (u == 0).
+//
+// Returns false as soon as a write into the stream fails, true otherwise.
+static bool fiCompressOneBlock(array_type *cbarray, image_rangeblock_type *b, unsigned int blocksize, unsigned int w, unsigned int h, unsigned int dxy_bitssize)
+{
+	unsigned int child;
+
+	if(b->is_divided) {
+		image_rangeblock_type *children = (image_rangeblock_type *)b->divided_into;
+
+		if(!arrayWriteBits(cbarray, 1, 1))
+			return false;
+
+		for(child = 0; child < 4; child++) {
+			if(!fiCompressOneBlock(cbarray, children+child,  blocksize/2, w, h, dxy_bitssize))
+				return false;
+		}
+
+		g_statistics_nofdivisions++;
+
+		return true;
+	}
+
+	// Leaf block. Blocks that cannot be divided carry no subdivision flag at all
+	if(blocksize%2 == 0 && blocksize > 3) {
+		if(!arrayWriteBits(cbarray, 1, 0))
+			return false;
+	}
+
+	if(!arrayWriteBits(cbarray, 5, (unsigned int)b->u))
+		return false;
+
+	if(b->u > 0) {
+		unsigned int position;
+
+		// v as sign and magnitude: 64 is added to the magnitude of a negative value
+		if(!arrayWriteBits(cbarray, 7, (unsigned int)((b->v<0)?64-b->v:b->v)))
+			return false;
+
+		// dx and dy share one number: dy*(w/2-1)+dx
+		position = (w/2-1)*((unsigned int)b->dy)+(unsigned int)b->dx;
+		if(!arrayWriteBits(cbarray, dxy_bitssize, position))
+			return false;
+
+		if(!arrayWriteBits(cbarray, 3, (unsigned int)b->tr))
+			return false;
+	} else {
+		// Without a scale factor only the low 6 bits of v are stored
+		if(!arrayWriteBits(cbarray, 6, (unsigned int)(b->v&63)))
+			return false;
+	}
+
+	g_statistics_nofblocks++;
+
+	return true;
+}
diff --git a/src/fi_save_blocks_compression.h b/src/fi_save_blocks_compression.h
new file mode 100644
index 0000000..3dacbb4
--- /dev/null
+++ b/src/fi_save_blocks_compression.h
@@ -0,0 +1,12 @@
+
+#ifndef _FI_SAVE_BLOCKS_COMPRESSION_H
+#define _FI_SAVE_BLOCKS_COMPRESSION_H
+
+typedef struct {
+	unsigned char *data; // byte buffer (presumably the compressed output - confirm against fiCompressBlocksAndAddHeader)
+	unsigned int len; // length of data in bytes (reported as "cdat array size - %d bytes")
+} fi_compresseddata_type;
+
+extern int fiCompressBlocksAndAddHeader(FI_HEADER *head ,fi_compresseddata_type *cdat, image_rangeblock_type *blocks, unsigned int nof_blocks); // NOTE(review): FI_HEADER and image_rangeblock_type are not declared by this header - includers must declare them first
+
+#endif
diff --git a/src/fi_save_blocks_search.c b/src/fi_save_blocks_search.c
new file mode 100644
index 0000000..0f03dc0
--- /dev/null
+++ b/src/fi_save_blocks_search.c
@@ -0,0 +1,171 @@
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#ifdef WIN32
+	#include <Windows.h>
+	#include <process.h>
+#else
+	#include <pthread.h>
+#endif
+
+#ifdef I_USE_MPI
+#include <mpi.h>
+#endif
+
+#include "image_misc.h"
+#include "fi_save.h"
+#include "fi_save_blocks_search.h"
+#include "fi_save_blocks_search_workers.h"
+
+// Finds the best domain block for every range block of image and returns the result in *data.
+// The range blocks are distributed over nof_threads worker threads running fiWorkerFunc (and over
+// the MPI ranks when built with I_USE_MPI); the call returns after all workers have finished.
+// blocksize must lie in [2, IMAGE_MAXBLOCKSIZE] and divide image->w and image->h, both of which must
+// be at least 2*blocksize. nof_threads is not validated here and is expected to be at least 1.
+// Returns FISAVE_OKAY (the caller then owns data->blocks), FISAVE_WRONGIMAGESIZE,
+// FISAVE_DAMAGEDIMAGESTRUCT, or FISAVE_MEMORYALLOCERROR (also when a worker thread cannot be created).
+int fiFindBestDomainBlocks(image_uc_type *image, unsigned int blocksize, unsigned int nof_threads, unsigned int worst_diff, fi_data_type *data)
+{
+	unsigned int i, nof_blocks;
+	unsigned char *scaled_data; // Image data downscaled by 2 (used for the domain block search); the channels are stored one after another
+	image_rangeblock_type *blocks;
+	fi_worker_arg_type *worker_args;
+#ifdef WIN32
+	HANDLE *worker_thread_handles;
+#else
+	pthread_t *worker_thread_handles;
+#endif
+#ifdef I_USE_MPI
+	int my_rank, nof_procs;
+#endif
+
+	if(blocksize < 2 || blocksize > IMAGE_MAXBLOCKSIZE) {
+		return FISAVE_WRONGIMAGESIZE;
+	}
+
+	if((image->w%blocksize) != 0 || (image->h%blocksize) != 0 || image->w < (blocksize*2) || image->h < (blocksize*2)) {
+		return FISAVE_WRONGIMAGESIZE;
+	}
+	if(image->w*image->h != image->alloc_memory)
+		return FISAVE_DAMAGEDIMAGESTRUCT;
+
+#ifdef I_USE_MPI
+	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &nof_procs);
+#endif
+
+	memset(data, 0, sizeof(fi_data_type));
+	nof_blocks = image->w*image->h*image->nof_channels/(blocksize*blocksize);
+	blocks = malloc(sizeof(image_rangeblock_type)*nof_blocks*(blocksize*blocksize-1)/3);
+	printf("blocks = malloc(%u), sizeof(image_rangeblock_type)=%u\n", (unsigned int)(sizeof(image_rangeblock_type)*nof_blocks*(blocksize*blocksize-1)/3), (unsigned int)(sizeof(image_rangeblock_type)));
+	// Every block may be split into 4, so a block of size n (a power of two) needs at most
+	// s = 1+4+16+...+4^(log2(n)-1) = (n*n-1)/3 entries; for n = 2^x*y the sum is smaller,
+	// hence (blocksize*blocksize-1)/3 entries per top-level block are always enough.
+	if(!blocks)
+		return FISAVE_MEMORYALLOCERROR;
+	memset(blocks, 0, sizeof(image_rangeblock_type)*nof_blocks*(blocksize*blocksize-1)/3);
+	SetBlocksPointers(blocks, blocksize, nof_blocks, image->w, image->h);
+
+	scaled_data = malloc(image->nof_channels*image->alloc_memory/4);
+	printf("scaled_data = malloc(%u)\n", image->nof_channels*image->alloc_memory/4);
+	if(!scaled_data) {
+		free(blocks);
+		return FISAVE_MEMORYALLOCERROR;
+	}
+	for(i = 0; i < image->nof_channels; i++)
+		Scale2to1(image->data[i], &scaled_data[i*image->alloc_memory/4], image->w, image->h);
+
+	worker_args = malloc(nof_threads*sizeof(fi_worker_arg_type));
+	if(!worker_args) {
+		free(scaled_data);
+		free(blocks);
+		return FISAVE_MEMORYALLOCERROR;
+	}
+#ifdef WIN32
+	worker_thread_handles = malloc(nof_threads*sizeof(HANDLE));
+#else
+	worker_thread_handles = malloc(nof_threads*sizeof(pthread_t));
+#endif
+	if(!worker_thread_handles) {
+		free(worker_args);
+		free(scaled_data);
+		free(blocks);
+		return FISAVE_MEMORYALLOCERROR;
+	}
+	// Fill in the arguments of the first worker
+	worker_args[0].scaled_data = scaled_data;
+	worker_args[0].data = image->data;
+	worker_args[0].block_p = blocks;
+	worker_args[0].nof_blocks_per_channel = nof_blocks/image->nof_channels;
+	worker_args[0].w = image->w;
+	worker_args[0].h = image->h;
+	worker_args[0].nof_channels = image->nof_channels;
+	worker_args[0].alloc_memory = image->alloc_memory;
+	worker_args[0].blocksize = blocksize;
+#ifdef I_USE_MPI
+	worker_args[0].worker_start = nof_threads*my_rank;
+	worker_args[0].worker_step = nof_threads*nof_procs;
+#else
+	worker_args[0].worker_start = 0;
+	worker_args[0].worker_step = nof_threads;
+#endif
+	worker_args[0].worst_diff = worst_diff;
+	for(i = 1; i < nof_threads; i++) { // Fill in the arguments of the remaining workers
+		memcpy(worker_args+i, worker_args, sizeof(fi_worker_arg_type));
+		worker_args[i].worker_start = worker_args[0].worker_start+i;
+	}
+
+#ifdef WIN32
+	// Create the worker threads (suspended until all of them exist)
+	for(i = 0; i < nof_threads; i++) {
+		worker_thread_handles[i] = (HANDLE)_beginthreadex(NULL, 0, fiWorkerFunc, worker_args+i, CREATE_SUSPENDED, NULL);
+		if(!worker_thread_handles[i]) {
+			unsigned int j;
+			for(j = 0; j < i; j++) {
+				CloseHandle(worker_thread_handles[j]);
+			}
+			free(worker_thread_handles);
+			free(worker_args);
+			free(scaled_data);
+			free(blocks);
+			return FISAVE_MEMORYALLOCERROR;
+		}
+	}
+
+	// Start the worker threads
+	for(i = 0; i < nof_threads; i++)
+		ResumeThread(worker_thread_handles[i]);
+#else
+	// Create the worker threads
+	for(i = 0; i < nof_threads; i++) {
+		if(pthread_create(worker_thread_handles+i, NULL, fiWorkerFunc, worker_args+i) != 0) {
+			unsigned int j;
+			// The workers already running use the shared buffers: wait for them before freeing
+			for(j = 0; j < i; j++)
+				pthread_join(worker_thread_handles[j], 0);
+			free(worker_thread_handles);
+			free(worker_args);
+			free(scaled_data);
+			free(blocks);
+			return FISAVE_MEMORYALLOCERROR;
+		}
+	}
+#endif
+
+#ifdef WIN32
+	WaitForMultipleObjects(nof_threads, worker_thread_handles, TRUE, INFINITE);
+	for(i = 0; i < nof_threads; i++)
+		CloseHandle(worker_thread_handles[i]);
+#else
+	for(i = 0; i < nof_threads; i++)
+		pthread_join(worker_thread_handles[i], 0);
+#endif
+
+	free(scaled_data);
+	free(worker_args);
+	free(worker_thread_handles);
+	data->blocks = blocks;
+	data->nof_blocks = nof_blocks;
+	return FISAVE_OKAY;
+}
diff --git a/src/fi_save_blocks_search.h b/src/fi_save_blocks_search.h
new file mode 100644
index 0000000..0dabfd6
--- /dev/null
+++ b/src/fi_save_blocks_search.h
@@ -0,0 +1,14 @@
+
+#ifndef _FI_SAVE_BLOCKS_SEARCH_H
+#define _FI_SAVE_BLOCKS_SEARCH_H
+
+#include "image.h"
+
+typedef struct {
+	image_rangeblock_type *blocks; // range block array allocated by fiFindBestDomainBlocks (sized for the top-level blocks and their possible sub-blocks)
+	unsigned int nof_blocks; // number of top-level range blocks over all channels
+} fi_data_type;
+
+extern int fiFindBestDomainBlocks(image_uc_type *image, unsigned int blocksize, unsigned int nof_threads, unsigned int worst_diff, fi_data_type *data); // returns FISAVE_OKAY and fills *data on success, another FISAVE_* code otherwise
+
+#endif
diff --git a/src/fi_save_blocks_search_workers.h b/src/fi_save_blocks_search_workers.h
new file mode 100644
index 0000000..65bd0c9
--- /dev/null
+++ b/src/fi_save_blocks_search_workers.h
@@ -0,0 +1,30 @@
+
+#ifndef _FI_SAVE_BLOCKS_SEARCH_WORKERS_H
+#define _FI_SAVE_BLOCKS_SEARCH_WORKERS_H
+
+#include "image.h"
+
+typedef struct {
+	unsigned char *scaled_data; // half-size image data, channels stored one after another
+	unsigned char **data; // full-size pixel arrays, one per channel
+	image_rangeblock_type *block_p; // start of the range block array shared by all workers
+	unsigned int nof_blocks_per_channel; // number of top-level range blocks in one channel
+	unsigned int w; // image width
+	unsigned int h; // image height
+	unsigned int nof_channels; // number of channels in data
+	unsigned int alloc_memory; // size of one full-size channel array (checked to equal w*h by fiFindBestDomainBlocks)
+	unsigned int blocksize; // size of the top-level range blocks
+	unsigned int worker_start; // index of the first range block handled by this worker
+	unsigned int worker_step; // distance between consecutive range blocks handled by this worker
+	unsigned int worst_diff; // difference threshold passed through from fiFindBestDomainBlocks
+} fi_worker_arg_type;
+
+#ifdef WIN32
+	extern unsigned int __stdcall fiWorkerFunc(void *arg); // thread entry point passed to _beginthreadex; arg is a fi_worker_arg_type *
+#else
+	extern void *fiWorkerFunc(void *arg); // thread entry point passed to pthread_create; arg is a fi_worker_arg_type *
+#endif
+
+
+
+#endif
diff --git a/src/gpu_driven/fi_save_blocks_search_workers.cu b/src/gpu_driven/fi_save_blocks_search_workers.cu
new file mode 100644
index 0000000..feefebf
--- /dev/null
+++ b/src/gpu_driven/fi_save_blocks_search_workers.cu
@@ -0,0 +1,767 @@
+
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <stdlib.h>
+
+extern "C" {
+#include "../image_misc.h"
+#include "../fi_save_blocks_search_workers.h"
+}
+
+#define IMAGE_MAXCUDABLOCKSIZE 64
+
+static void ShowMeSomeCUDAInfo(void);
+
+typedef struct {
+	unsigned int dx; // X offset of the domain block
+	unsigned int dy; // Y offset of the domain block
+	unsigned int tr; // Transformation
+	unsigned int u; // Contrast
+	int v; // Brightness
+	unsigned int diff; // Sum of squared pixel differences of the candidate stored in this slot
+} kernel_block_type;
+
+// Copies the range block at (rx, ry) into range_block without reordering and accumulates, against the
+// domain block at (dx, dy) of the half-size image: range_sum, domain_sum, rd_sum and n*sum(bd^2)-sum(bd)^2.
+__device__ void fiCalcReversedTransformationNone(long long &range_sum, long long &domain_sum, long long &rd_sum, long long &domain_disp, int *range_block,
+	unsigned int dx, unsigned int dy,
+	unsigned int rx, unsigned int ry, unsigned int blocksize,
+	unsigned int w, unsigned int h,
+	int *scaled_data_cuda, int *data_cuda)
+{
+	const int area = (int)(blocksize*blocksize);
+	unsigned int row, col;
+
+	range_sum = 0;
+	domain_sum = 0;
+	rd_sum = 0;
+	domain_disp = 0;
+	for(row = 0; row < blocksize; row++) {
+		for(col = 0; col < blocksize; col++) {
+			const int br = data_cuda[(ry+row)*w+rx+col];
+			const int bd = scaled_data_cuda[(dy+row)*w/2+dx+col];
+
+			range_block[row*blocksize+col] = br;
+			range_sum += br;
+			domain_sum += bd;
+			domain_disp += bd*bd;
+			rd_sum += br*bd;
+		}
+	}
+	domain_disp = domain_disp*area-domain_sum*domain_sum;
+}
+
+// Fills range_block[row][col] with the range pixel at row `col`, column `blocksize-1-row` of the block at
+// (rx, ry) and accumulates the sums against the domain block at (dx, dy) of the half-size image.
+__device__ void fiCalcReversedTransformation90Right(long long &range_sum, long long &domain_sum, long long &rd_sum, long long &domain_disp, int *range_block,
+	unsigned int dx, unsigned int dy,
+	unsigned int rx, unsigned int ry, unsigned int blocksize,
+	unsigned int w, unsigned int h,
+	int *scaled_data_cuda, int *data_cuda)
+{
+	const int area = (int)(blocksize*blocksize);
+	unsigned int row, col;
+
+	range_sum = 0;
+	domain_sum = 0;
+	rd_sum = 0;
+	domain_disp = 0;
+	for(row = 0; row < blocksize; row++) {
+		for(col = 0; col < blocksize; col++) {
+			const int br = data_cuda[(ry+col)*w+rx+blocksize-1-row];
+			const int bd = scaled_data_cuda[(dy+row)*w/2+dx+col];
+
+			range_block[row*blocksize+col] = br;
+			range_sum += br;
+			domain_sum += bd;
+			domain_disp += bd*bd;
+			rd_sum += br*bd;
+		}
+	}
+	domain_disp = domain_disp*area-domain_sum*domain_sum;
+}
+
+// Fills range_block[row][col] with the range pixel at row `blocksize-1-row`, column `blocksize-1-col` of the
+// block at (rx, ry) and accumulates the sums against the domain block at (dx, dy) of the half-size image.
+__device__ void fiCalcReversedTransformation180Right(long long &range_sum, long long &domain_sum, long long &rd_sum, long long &domain_disp, int *range_block,
+	unsigned int dx, unsigned int dy,
+	unsigned int rx, unsigned int ry, unsigned int blocksize,
+	unsigned int w, unsigned int h,
+	int *scaled_data_cuda, int *data_cuda)
+{
+	const int area = (int)(blocksize*blocksize);
+	unsigned int row, col;
+
+	range_sum = 0;
+	domain_sum = 0;
+	rd_sum = 0;
+	domain_disp = 0;
+	for(row = 0; row < blocksize; row++) {
+		for(col = 0; col < blocksize; col++) {
+			const int br = data_cuda[(ry+blocksize-1-row)*w+rx+blocksize-1-col];
+			const int bd = scaled_data_cuda[(dy+row)*w/2+dx+col];
+
+			range_block[row*blocksize+col] = br;
+			range_sum += br;
+			domain_sum += bd;
+			domain_disp += bd*bd;
+			rd_sum += br*bd;
+		}
+	}
+	domain_disp = domain_disp*area-domain_sum*domain_sum;
+}
+
+// Fills range_block[row][col] with the range pixel at row `blocksize-1-col`, column `row` of the block at
+// (rx, ry) and accumulates the sums against the domain block at (dx, dy) of the half-size image.
+__device__ void fiCalcReversedTransformation270Right(long long &range_sum, long long &domain_sum, long long &rd_sum, long long &domain_disp, int *range_block,
+	unsigned int dx, unsigned int dy,
+	unsigned int rx, unsigned int ry, unsigned int blocksize,
+	unsigned int w, unsigned int h,
+	int *scaled_data_cuda, int *data_cuda)
+{
+	const int area = (int)(blocksize*blocksize);
+	unsigned int row, col;
+
+	range_sum = 0;
+	domain_sum = 0;
+	rd_sum = 0;
+	domain_disp = 0;
+	for(row = 0; row < blocksize; row++) {
+		for(col = 0; col < blocksize; col++) {
+			const int br = data_cuda[(ry+blocksize-1-col)*w+rx+row];
+			const int bd = scaled_data_cuda[(dy+row)*w/2+dx+col];
+
+			range_block[row*blocksize+col] = br;
+			range_sum += br;
+			domain_sum += bd;
+			domain_disp += bd*bd;
+			rd_sum += br*bd;
+		}
+	}
+	domain_disp = domain_disp*area-domain_sum*domain_sum;
+}
+
+// Fills range_block[row][col] with the range pixel at row `blocksize-1-row`, column `col` of the block at
+// (rx, ry) and accumulates the sums against the domain block at (dx, dy) of the half-size image.
+__device__ void fiCalcReversedTransformationVFlip(long long &range_sum, long long &domain_sum, long long &rd_sum, long long &domain_disp, int *range_block,
+	unsigned int dx, unsigned int dy,
+	unsigned int rx, unsigned int ry, unsigned int blocksize,
+	unsigned int w, unsigned int h,
+	int *scaled_data_cuda, int *data_cuda)
+{
+	const int area = (int)(blocksize*blocksize);
+	unsigned int row, col;
+
+	range_sum = 0;
+	domain_sum = 0;
+	rd_sum = 0;
+	domain_disp = 0;
+	for(row = 0; row < blocksize; row++) {
+		for(col = 0; col < blocksize; col++) {
+			const int br = data_cuda[(ry+blocksize-1-row)*w+rx+col];
+			const int bd = scaled_data_cuda[(dy+row)*w/2+dx+col];
+
+			range_block[row*blocksize+col] = br;
+			range_sum += br;
+			domain_sum += bd;
+			domain_disp += bd*bd;
+			rd_sum += br*bd;
+		}
+	}
+	domain_disp = domain_disp*area-domain_sum*domain_sum;
+}
+
+// Fills range_block[row][col] with the range pixel at row `row`, column `blocksize-1-col` of the block at
+// (rx, ry) and accumulates the sums against the domain block at (dx, dy) of the half-size image.
+__device__ void fiCalcReversedTransformationHFlip(long long &range_sum, long long &domain_sum, long long &rd_sum, long long &domain_disp, int *range_block,
+	unsigned int dx, unsigned int dy,
+	unsigned int rx, unsigned int ry, unsigned int blocksize,
+	unsigned int w, unsigned int h,
+	int *scaled_data_cuda, int *data_cuda)
+{
+	const int area = (int)(blocksize*blocksize);
+	unsigned int row, col;
+
+	range_sum = 0;
+	domain_sum = 0;
+	rd_sum = 0;
+	domain_disp = 0;
+	for(row = 0; row < blocksize; row++) {
+		for(col = 0; col < blocksize; col++) {
+			const int br = data_cuda[(ry+row)*w+rx+blocksize-1-col];
+			const int bd = scaled_data_cuda[(dy+row)*w/2+dx+col];
+
+			range_block[row*blocksize+col] = br;
+			range_sum += br;
+			domain_sum += bd;
+			domain_disp += bd*bd;
+			rd_sum += br*bd;
+		}
+	}
+	domain_disp = domain_disp*area-domain_sum*domain_sum;
+}
+
+// Fills range_block[row][col] with the range pixel at row `col`, column `row` of the block at (rx, ry)
+// (a transpose) and accumulates the sums against the domain block at (dx, dy) of the half-size image.
+__device__ void fiCalcReversedTransformationMainDiagFlip(long long &range_sum, long long &domain_sum, long long &rd_sum, long long &domain_disp, int *range_block,
+	unsigned int dx, unsigned int dy,
+	unsigned int rx, unsigned int ry, unsigned int blocksize,
+	unsigned int w, unsigned int h,
+	int *scaled_data_cuda, int *data_cuda)
+{
+	const int area = (int)(blocksize*blocksize);
+	unsigned int row, col;
+
+	range_sum = 0;
+	domain_sum = 0;
+	rd_sum = 0;
+	domain_disp = 0;
+	for(row = 0; row < blocksize; row++) {
+		for(col = 0; col < blocksize; col++) {
+			const int br = data_cuda[(ry+col)*w+rx+row];
+			const int bd = scaled_data_cuda[(dy+row)*w/2+dx+col];
+
+			range_block[row*blocksize+col] = br;
+			range_sum += br;
+			domain_sum += bd;
+			domain_disp += bd*bd;
+			rd_sum += br*bd;
+		}
+	}
+	domain_disp = domain_disp*area-domain_sum*domain_sum;
+}
+
+// Fills range_block[row][col] with the range pixel at row `blocksize-1-col`, column `blocksize-1-row` of the
+// block at (rx, ry) and accumulates the sums against the domain block at (dx, dy) of the half-size image.
+__device__ void fiCalcReversedTransformationAntiDiagFlip(long long &range_sum, long long &domain_sum, long long &rd_sum, long long &domain_disp, int *range_block,
+	unsigned int dx, unsigned int dy,
+	unsigned int rx, unsigned int ry, unsigned int blocksize,
+	unsigned int w, unsigned int h,
+	int *scaled_data_cuda, int *data_cuda)
+{
+	const int area = (int)(blocksize*blocksize);
+	unsigned int row, col;
+
+	range_sum = 0;
+	domain_sum = 0;
+	rd_sum = 0;
+	domain_disp = 0;
+	for(row = 0; row < blocksize; row++) {
+		for(col = 0; col < blocksize; col++) {
+			const int br = data_cuda[(ry+blocksize-1-col)*w+rx+blocksize-1-row];
+			const int bd = scaled_data_cuda[(dy+row)*w/2+dx+col];
+
+			range_block[row*blocksize+col] = br;
+			range_sum += br;
+			domain_sum += bd;
+			domain_disp += bd*bd;
+			rd_sum += br*bd;
+		}
+	}
+	domain_disp = domain_disp*area-domain_sum*domain_sum;
+}
+
+// Search kernel for the range block of size blocksize at (rx, ry). Each thread evaluates one candidate:
+// a domain block position (dx, dy) in the half-size image and one of the 8 transformations (tr). It fits
+// u and v, compares the result with a flat "mono" approximation (u = 0) and keeps the smaller error.
+// Per CUDA block, the single thread with the smallest key writes its candidate to that block's slot in
+// kernel_blocks_cuda. The key is the truncated square root of the error scaled by the number of threads
+// in the block plus the linear thread index (including threadIdx.z), so it is unique within the block.
+__global__ void fiFindBestDomainBlockKernel(unsigned int rx, unsigned int ry,
+	kernel_block_type *kernel_blocks_cuda, unsigned int blocksize,
+	unsigned int w, unsigned int h,
+	int *scaled_data_cuda,
+	int *data_cuda
+	)
+{
+	/*__shared__ */int range_block[IMAGE_MAXCUDABLOCKSIZE*IMAGE_MAXCUDABLOCKSIZE]; // Transformed copy of the range block, filled by the fiCalcReversedTransformation* call below
+	long long range_sum, domain_sum, rd_sum; // Sums of the range block pixels, of the domain block pixels, and of the range pixel * domain pixel products
+	long long domain_disp;
+	unsigned int diff;
+	int u; int v;
+	unsigned int mono_diff;
+	int mono_u; int mono_v;
+	unsigned int local_diff;
+	__shared__ unsigned int best_diff;
+	int local_u; int local_v;
+	unsigned int dx, dy; // Coordinates of the domain block
+	unsigned int tr; // Transformation evaluated by the current thread
+	kernel_block_type *kernel_block_cuda; // Result slot of the current CUDA block
+	unsigned int l, m;
+
+	atomicExch(&best_diff, 0xFFFFFFFFu); // Largest possible key, so that any candidate replaces it
+	__syncthreads();
+	kernel_block_cuda = kernel_blocks_cuda+blockIdx.z*gridDim.y*gridDim.x+blockIdx.y*gridDim.x+blockIdx.x;
+	dx = (blockIdx.x*blockDim.x+threadIdx.x)%(w/2-blocksize+1);
+	dy = (blockIdx.y*blockDim.y+threadIdx.y)%(h/2-blocksize+1);
+	tr = blockIdx.z+threadIdx.z;
+
+	switch(tr) {
+		case IMAGE_RB_TRANSFORMATION_NONE:
+			fiCalcReversedTransformationNone(range_sum, domain_sum, rd_sum, domain_disp, range_block,
+				dx, dy,
+				rx, ry, blocksize,
+				w, h,
+				scaled_data_cuda,
+				data_cuda
+				);
+			break;
+		case IMAGE_RB_TRANSFORMATION_90RIGHT:
+			fiCalcReversedTransformation90Right(range_sum, domain_sum, rd_sum, domain_disp, range_block,
+				dx, dy,
+				rx, ry, blocksize,
+				w, h,
+				scaled_data_cuda,
+				data_cuda
+				);
+			break;
+		case IMAGE_RB_TRANSFORMATION_180RIGHT:
+			fiCalcReversedTransformation180Right(range_sum, domain_sum, rd_sum, domain_disp, range_block,
+				dx, dy,
+				rx, ry, blocksize,
+				w, h,
+				scaled_data_cuda,
+				data_cuda
+				);
+			break;
+		case IMAGE_RB_TRANSFORMATION_270RIGHT:
+			fiCalcReversedTransformation270Right(range_sum, domain_sum, rd_sum, domain_disp, range_block,
+				dx, dy,
+				rx, ry, blocksize,
+				w, h,
+				scaled_data_cuda,
+				data_cuda
+				);
+			break;
+		case IMAGE_RB_TRANSFORMATION_VFLIP:
+			fiCalcReversedTransformationVFlip(range_sum, domain_sum, rd_sum, domain_disp, range_block,
+				dx, dy,
+				rx, ry, blocksize,
+				w, h,
+				scaled_data_cuda,
+				data_cuda
+				);
+			break;
+		case IMAGE_RB_TRANSFORMATION_HFLIP:
+			fiCalcReversedTransformationHFlip(range_sum, domain_sum, rd_sum, domain_disp, range_block,
+				dx, dy,
+				rx, ry, blocksize,
+				w, h,
+				scaled_data_cuda,
+				data_cuda
+				);
+			break;
+		case IMAGE_RB_TRANSFORMATION_MAINDIAGFLIP:
+			fiCalcReversedTransformationMainDiagFlip(range_sum, domain_sum, rd_sum, domain_disp, range_block,
+				dx, dy,
+				rx, ry, blocksize,
+				w, h,
+				scaled_data_cuda,
+				data_cuda
+				);
+			break;
+		case IMAGE_RB_TRANSFORMATION_ANTIDIAGFLIP:
+			fiCalcReversedTransformationAntiDiagFlip(range_sum, domain_sum, rd_sum, domain_disp, range_block,
+				dx, dy,
+				rx, ry, blocksize,
+				w, h,
+				scaled_data_cuda,
+				data_cuda
+				);
+			break;
+	}
+
+	mono_u = 0; mono_v = (int)( ((range_sum/(blocksize*blocksize))*63+128)/255 ); // Fallback values in case no domain block fits at all
+	if(domain_disp) {
+		u = (int)( 32*((int)(blocksize*blocksize)*rd_sum-range_sum*domain_sum)/domain_disp ); // factor in [0, 1.0) represented as an integer in [0, 32)
+		if(u < 0) u = 0; if(u > 31) u = 31;
+		v = (int)( (range_sum-domain_sum*u/32)/(int)(blocksize*blocksize) );
+		if(v < -255 || v > 255) {
+			u = mono_u;
+			v = mono_v;
+		} else {
+			if(v < 0)
+				v = (v*63-128)/255;
+			else
+				v = (v*63+128)/255;
+		}
+	} else {
+		u = mono_u;
+		v = mono_v;
+	}
+
+	mono_diff = 0; // Difference obtained with mono_v and mono_u (mono_diff)
+	diff = 0; // Difference obtained with v and u for the domain block at dx, dy (diff)
+	for(l = 0; l < blocksize; l++) {
+		for(m = 0; m < blocksize; m++) {
+			int br = range_block[l*blocksize+m];
+			int bd = scaled_data_cuda[(dy+l)*w/2+dx+m];
+			int mult = (mono_v*255/63-br); // error of the flat approximation
+			mono_diff += mult*mult;
+			mult = (bd*u/32+v*255/63-br); // error of the scaled and shifted domain pixel
+			diff += mult*mult;
+		}
+	}
+
+	if(diff < mono_diff) {
+		local_diff = diff;
+		local_u = u;
+		local_v = v;
+	} else {
+		local_diff = mono_diff;
+		local_u = mono_u;
+		local_v = mono_v;
+	}
+
+	unsigned int nof_block_threads = blockDim.x*blockDim.y*blockDim.z;
+	unsigned int thread_index = (threadIdx.z*blockDim.y+threadIdx.y)*blockDim.x+threadIdx.x;
+	unsigned int uniq_diff = (unsigned int)sqrtf(local_diff)*nof_block_threads+thread_index; // sqrt keeps the key small enough to fit in 32 bits after scaling
+	atomicMin(&best_diff, uniq_diff);
+	__syncthreads();
+
+	if(best_diff == uniq_diff) {
+		kernel_block_cuda->dx = dx;
+		kernel_block_cuda->dy = dy;
+		kernel_block_cuda->tr = tr;
+		kernel_block_cuda->u = local_u;
+		kernel_block_cuda->v = local_v;
+		kernel_block_cuda->diff = local_diff;
+	}
+}
+
+// Host-side search for the range block of size blocksize at (rx, ry).
+// Runs fiFindBestDomainBlockKernel, takes the result slot with the smallest diff among the
+// blocks.x*blocks.y*blocks.z slots and copies its dx, dy, u, v and tr into *block_p.
+// When that diff is above worst_diff and blocksize is even and greater than 2, the four quadrants
+// are searched recursively (half the size, worst_diff/4) into the entries of block_p->divided_into
+// and block_p is marked as divided.
+//   rx, ry                             - position of the range block in the full-size channel
+//   block_p                            - range block entry that receives the result
+//   kernel_blocks_cuda / kernel_blocks - device and host buffers holding one result slot per CUDA block
+//   w, h                               - image size, passed through to the kernel
+//   threads, blocks                    - launch configuration, reused unchanged for the sub-blocks
+//   scaled_data_cuda, data_cuda        - device copies of the half-size and full-size channel
+//   worst_diff                         - largest diff accepted without splitting the block
+// NOTE(review): the results of the CUDA calls are not checked here.
+static void fiFindBestDomainBlock(unsigned int rx, unsigned int ry,
+	image_rangeblock_type *block_p, kernel_block_type *kernel_blocks_cuda, kernel_block_type *kernel_blocks, unsigned int blocksize,
+	unsigned int w, unsigned int h,
+	dim3 threads, dim3 blocks,
+	int *scaled_data_cuda,
+	int *data_cuda,
+	unsigned int worst_diff
+	)
+{
+	const unsigned int nof_slots = blocks.x*blocks.y*blocks.z;
+	const size_t slots_size = sizeof(kernel_block_type)*blocks.x*blocks.y*blocks.z;
+	const unsigned int half = blocksize/2;
+	kernel_block_type *winner = kernel_blocks;
+	unsigned int best_diff, slot, quadrant;
+
+	// Zero the result slots on host and device, run the search and fetch the results
+	memset(kernel_blocks, 0, slots_size);
+	cudaMemcpy(kernel_blocks_cuda, kernel_blocks, slots_size, cudaMemcpyHostToDevice);
+
+	fiFindBestDomainBlockKernel <<<blocks, threads>>>(rx, ry, kernel_blocks_cuda, blocksize, w, h, scaled_data_cuda, data_cuda);
+
+	cudaMemcpy(kernel_blocks, kernel_blocks_cuda, slots_size, cudaMemcpyDeviceToHost);
+
+	// Slot 0 is the initial candidate; a later slot replaces it only with a strictly smaller diff
+	for(slot = 1; slot < nof_slots; slot++) {
+		if(kernel_blocks[slot].diff < winner->diff)
+			winner = kernel_blocks+slot;
+	}
+	best_diff = winner->diff; // saved now: the recursive calls below reuse kernel_blocks
+
+	block_p->dx = winner->dx;
+	block_p->dy = winner->dy;
+	block_p->u = winner->u;
+	block_p->v = winner->v;
+	block_p->tr = winner->tr;
+
+	// Stop here when the match is good enough or the block cannot be halved
+	if(best_diff <= worst_diff || blocksize <= 2 || blocksize%2 != 0)
+		return;
+
+	printf("best_diff %d/%d (=rms^2*blocksize^2), divided block size %d to %d\n", best_diff, worst_diff, blocksize, blocksize/2);
+
+	// Quadrant order: (rx, ry), (rx+half, ry), (rx, ry+half), (rx+half, ry+half)
+	for(quadrant = 0; quadrant < 4; quadrant++) {
+		fiFindBestDomainBlock(rx+(quadrant%2)*half, ry+(quadrant/2)*half, // Position of the new block
+			(image_rangeblock_type *)block_p->divided_into+quadrant, kernel_blocks_cuda, kernel_blocks, // The new block, the result slots on the device and their host copy
+			half, // Half the block size
+			w, h, threads, blocks, scaled_data_cuda, data_cuda,
+			worst_diff/4); // A quarter of the threshold, since the new block is 4 times smaller
+	}
+
+	block_p->is_divided = true;
+}
+
+// CUDA worker thread entry point; arg points to a fi_worker_arg_type. The worker handles the range blocks
+// worker_start, worker_start+worker_step, ... of the whole image: whenever the channel changes it uploads
+// that channel's half-size and full-size pixels (widened to int) and then runs fiFindBestDomainBlock.
+// Always returns 0: failures are only reported on stdout and leave the remaining blocks unprocessed.
+// The CUDA device is reset at the end only when worker_step == 1.
+#ifdef WIN32
+unsigned int __stdcall fiWorkerFunc(void *arg)
+#else
+void *fiWorkerFunc(void *arg)
+#endif
+{
+	unsigned int i;
+	unsigned int rx, ry, channel, copied_channel;
+
+	unsigned char *scaled_data = ((fi_worker_arg_type *)arg)->scaled_data;
+	unsigned char **data = ((fi_worker_arg_type *)arg)->data;
+	int *data_in_int = 0;
+	image_rangeblock_type *block_start_p = ((fi_worker_arg_type *)arg)->block_p, *block_p;
+	unsigned int nof_blocks_per_channel = ((fi_worker_arg_type *)arg)->nof_blocks_per_channel;
+	unsigned int w = ((fi_worker_arg_type *)arg)->w;
+	unsigned int h = ((fi_worker_arg_type *)arg)->h;
+	unsigned int nof_channels = ((fi_worker_arg_type *)arg)->nof_channels;
+	unsigned int alloc_memory = ((fi_worker_arg_type *)arg)->alloc_memory;
+	unsigned int blocksize = ((fi_worker_arg_type *)arg)->blocksize;
+	unsigned int worker_start = ((fi_worker_arg_type *)arg)->worker_start;
+	unsigned int worker_step = ((fi_worker_arg_type *)arg)->worker_step;
+	unsigned int worst_diff = ((fi_worker_arg_type *)arg)->worst_diff;
+
+	unsigned int nof_blocks_per_image = nof_blocks_per_channel*nof_channels;
+
+	int *scaled_data_cuda;
+	kernel_block_type *kernel_blocks_cuda;
+	kernel_block_type *kernel_blocks;
+	int *data_cuda;
+
+	cudaEvent_t start, stop;
+	double cuda_working_time = 0;
+
+	cudaDeviceProp device_prop;
+	dim3 threads, blocks;
+
+	ShowMeSomeCUDAInfo();
+
+	if(blocksize > IMAGE_MAXCUDABLOCKSIZE) {
+		printf("Sorry, but IMAGE_MAXCUDABLOCKSIZE=%d < blocksize=%u\n", IMAGE_MAXCUDABLOCKSIZE, blocksize);
+		return 0;
+	}
+
+	printf("cudaMalloc(&scaled_data_cuda, alloc_memory/4=%u)\n", (unsigned int)(sizeof(int)*alloc_memory/4)); // size_t must not be passed to %d
+	if(cudaMalloc(&scaled_data_cuda, sizeof(int)*alloc_memory/4) != cudaSuccess) {
+		printf("Can\'t allocate memory for scaled_data_cuda\n");
+		return 0;
+	}
+	printf("cudaMalloc(&data_cuda, alloc_memory=%u)\n", (unsigned int)(sizeof(int)*alloc_memory));
+	if(cudaMalloc(&data_cuda, sizeof(int)*alloc_memory) != cudaSuccess) {
+		printf("Can\'t allocate memory for data_cuda_p\n");
+		cudaFree(scaled_data_cuda);
+
+		return 0;
+	}
+
+	if(cudaEventCreate(&start) != cudaSuccess) {
+		printf("Can\'t create event \"start\"\n");
+		cudaFree(scaled_data_cuda);
+		cudaFree(data_cuda);
+
+		return 0;
+	}
+	if(cudaEventCreate(&stop) != cudaSuccess) {
+		printf("Can\'t create event \"stop\"\n");
+		cudaFree(scaled_data_cuda);
+		cudaFree(data_cuda);
+
+		cudaEventDestroy(start);
+
+		return 0;
+	}
+
+	cudaGetDeviceProperties(&device_prop, 0);
+
+	int number_of_working_threads = device_prop.maxThreadsPerBlock/2; // "Magic": start from half of the device limit
+	int number_of_working_threads_multiplier = 1;
+	int threads_z = 1;
+
+	if(device_prop.maxGridSize[2] < 8) {
+		number_of_working_threads /= 8;
+		threads_z = 8;
+	}
+
+	while( ((unsigned int)(sqrt((float)number_of_working_threads)+0.5)*(unsigned int)(sqrt((float)number_of_working_threads)+0.5)) != number_of_working_threads) {
+		number_of_working_threads /= 2;
+		number_of_working_threads_multiplier *= 2;
+	}
+
+	threads = dim3((unsigned int)(sqrt((float)number_of_working_threads)+0.5)*number_of_working_threads_multiplier, (unsigned int)(sqrt((float)number_of_working_threads)+0.5), threads_z);
+	blocks = dim3(((w/2-blocksize+1)+threads.x-1)/threads.x, ((h/2-blocksize+1)+threads.y-1)/threads.y, 8/threads.z);
+
+	printf("threads(%u,%u,%u) blocks(%u,%u,%u)\n", threads.x, threads.y, threads.z, blocks.x, blocks.y, blocks.z);
+
+	if((int)(threads.x*threads.y*threads.z) > device_prop.maxThreadsPerBlock) {
+		printf("threads.x*threads.y*threads.z is greater than %d, exiting...\n", device_prop.maxThreadsPerBlock);
+
+		cudaFree(scaled_data_cuda);
+		cudaFree(data_cuda);
+
+		cudaEventDestroy(start);
+		cudaEventDestroy(stop);
+
+		return 0;
+	}
+
+	printf("cudaMalloc(&kernel_blocks_cuda, sizeof(kernel_block_type)*blocks.x*blocks.y*blocks.z=%u)\n", (unsigned int)(sizeof(kernel_block_type)*blocks.x*blocks.y*blocks.z));
+	if(cudaMalloc(&kernel_blocks_cuda, sizeof(kernel_block_type)*blocks.x*blocks.y*blocks.z) != cudaSuccess) {
+		printf("Can\'t allocate memory for kernel_blocks_cuda\n");
+
+		cudaFree(scaled_data_cuda);
+		cudaFree(data_cuda);
+
+		cudaEventDestroy(start);
+		cudaEventDestroy(stop);
+
+		return 0;
+	}
+
+	printf("kernel_blocks = malloc(sizeof(kernel_block_type)*blocks.x*blocks.y*blocks.z=%u), sizeof(kernel_block_type)=%u\n", (unsigned int)(sizeof(kernel_block_type)*blocks.x*blocks.y*blocks.z), (unsigned int)sizeof(kernel_block_type));
+	kernel_blocks = (kernel_block_type *)malloc(sizeof(kernel_block_type)*blocks.x*blocks.y*blocks.z);
+	if(kernel_blocks == 0) {
+		printf("Can\'t allocate memory for kernel_blocks\n");
+
+		cudaFree(scaled_data_cuda);
+		cudaFree(data_cuda);
+		cudaFree(kernel_blocks_cuda);
+
+		cudaEventDestroy(start);
+		cudaEventDestroy(stop);
+
+		return 0;
+	}
+
+	copied_channel = nof_channels+1;
+	block_p = block_start_p+worker_start;
+
+	for(i = worker_start; i < nof_blocks_per_image; i += worker_step) {
+		float one_block_time;
+
+		channel = i / nof_blocks_per_channel;
+		rx = ((i % nof_blocks_per_channel) % (w/blocksize)) * blocksize;
+		ry = ((i % nof_blocks_per_channel) / (w/blocksize)) * blocksize;
+
+		if(copied_channel != channel) {
+			unsigned int j;
+
+			if(!data_in_int)
+				data_in_int = (int *)malloc(sizeof(int)*alloc_memory);
+
+			if(!data_in_int) {
+				printf("Can\'t allocate memory for data_in_int\n");
+
+				break;
+			}
+
+			copied_channel = channel;
+
+			for(j = 0; j < alloc_memory/4; j++) {
+				data_in_int[j] = scaled_data[channel*alloc_memory/4+j];
+			}
+			cudaMemcpy(scaled_data_cuda, data_in_int, sizeof(int)*alloc_memory/4, cudaMemcpyHostToDevice);
+
+			for(j = 0; j < alloc_memory; j++) {
+				data_in_int[j] = data[channel][j];
+			}
+			cudaMemcpy(data_cuda, data_in_int, sizeof(int)*alloc_memory, cudaMemcpyHostToDevice);
+		}
+
+		if(((i-worker_start)/worker_step)%10 == 0)
+			printf("worker %u %u rbs (%f) ok\n", worker_start+1, (i-worker_start)/worker_step,
+				(float)( 100.0 * (double)((i-worker_start)/worker_step) / (double)((nof_blocks_per_image-worker_start-1)/worker_step+1) ));
+
+		cudaEventRecord(start, 0);
+
+		fiFindBestDomainBlock(rx, ry,
+			block_p, kernel_blocks_cuda, kernel_blocks, blocksize,
+			w, h,
+			threads, blocks,
+			scaled_data_cuda,
+			data_cuda,
+			worst_diff);
+
+		cudaEventRecord(stop, 0);
+		cudaEventSynchronize(stop);
+		cudaEventElapsedTime(&one_block_time, start, stop);
+		cuda_working_time += one_block_time;
+
+		block_p += worker_step;
+	}
+
+	if(data_in_int)
+		free(data_in_int);
+	cudaFree(scaled_data_cuda);
+	cudaFree(data_cuda);
+	cudaFree(kernel_blocks_cuda);
+	free(kernel_blocks);
+
+	cudaEventDestroy(start);
+	cudaEventDestroy(stop);
+
+	if(worker_step == 1) { // Reset the device only when there is a single worker thread
+		if(cudaDeviceReset() != cudaSuccess) {
+			printf("Can\'t reset cuda device\n");
+		}
+	}
+
+	printf("cuda_working_time %f ms\n", cuda_working_time);
+
+	return 0;
+}
+
+// Prints the number of CUDA devices and a selection of cudaDeviceProp fields for each of them to stdout.
+// If the device count cannot be queried, a count of 0 is printed and no device is listed.
+static void ShowMeSomeCUDAInfo(void)
+{
+	int device_count = 0;
+	cudaDeviceProp device_prop;
+
+	if(cudaGetDeviceCount(&device_count) != cudaSuccess)
+		device_count = 0; // do not loop over a count the runtime did not provide
+	printf("Device count: %d\n\n", device_count);
+
+	for(int i = 0; i < device_count; i++) {
+		cudaGetDeviceProperties(&device_prop, i);
+
+		printf("Device name: %s\n", device_prop.name);
+		printf("Compute capability: %d.%d\n", device_prop.major, device_prop.minor);
+		printf("Total constant memory: %lld\n", (long long)(device_prop.totalConstMem));
+		printf("Total global memory: %lld\n", (long long)(device_prop.totalGlobalMem));
+		printf("Shared memory per block: %lld\n", (long long)(device_prop.sharedMemPerBlock));
+		printf("Registers per block: %d\n", device_prop.regsPerBlock);
+		printf("Warp size: %d\n", device_prop.warpSize);
+		printf("Memory pitch: %lld\n", (long long)(device_prop.memPitch));
+		printf("Max threads per block: %d\n", device_prop.maxThreadsPerBlock);
+		printf("Max threads dimensions: x = %d, y = %d, z = %d\n",
+			device_prop.maxThreadsDim[0],
+			device_prop.maxThreadsDim[1],
+			device_prop.maxThreadsDim[2]);
+
+		printf("Max grid size: x = %d, y = %d, z = %d\n",
+			device_prop.maxGridSize[0],
+			device_prop.maxGridSize[1],
+			device_prop.maxGridSize[2]);
+
+		printf("Max Surface1D: %d\n", device_prop.maxSurface1D);
+		printf("Max Surface2D: %d x %d\n", device_prop.maxSurface2D[0], device_prop.maxSurface2D[1]); // maxSurface2D is an int[2], not an int
+
+		printf("Clock rate: %d\n", device_prop.clockRate);
+		printf("Total constant memory: %lld\n", (long long)(device_prop.totalConstMem)); // size_t, so cast as in the lines above
+		printf("Compute capability: %d.%d\n", device_prop.major, device_prop.minor);
+		printf("Texture alignment: %lld\n", (long long)(device_prop.textureAlignment)); // size_t as well
+		printf("Device overlap: %d\n", device_prop.deviceOverlap);
+		printf("Multiprocessor count: %d\n", device_prop.multiProcessorCount);
+		printf("Kernel execution timeout enabled: %s\n",
+			device_prop.kernelExecTimeoutEnabled ? "true" : "false");
+	}
+	printf("\n");
+}
diff --git a/src/image.h b/src/image.h
new file mode 100644
index 0000000..ebbca84
--- /dev/null
+++ b/src/image.h
@@ -0,0 +1,49 @@
+
+#ifndef _IMAGE_H
+#define _IMAGE_H
+
+#include <stdbool.h>
+
+typedef struct {
+	unsigned int w; // Image width
+	unsigned int h; // Image height
+	unsigned int nof_channels; // Number of channels per pixel
+	unsigned int alloc_memory; // Total size of the data array
+	unsigned char *data;
+} image_pc_type; // pc == packed channels: the channels of one pixel follow each other, everything is kept in a single array
+
+typedef struct {
+	unsigned int w; // Image width
+	unsigned int h; // Image height
+	unsigned int nof_channels; // Number of arrays in data
+	unsigned int alloc_memory; // Size of the array allocated for each channel in data
+	unsigned char **data;
+} image_uc_type; // uc == unpacked channels: a separate array for every channel (red, green, blue)
+
+typedef struct {
+	unsigned short dx; // X offset of the domain block
+	unsigned short dy; // Y offset of the domain block
+	unsigned char tr; // Transformation (one of the IMAGE_RB_TRANSFORMATION_* codes)
+	unsigned char u; // Contrast
+	short v; // Brightness
+	void *divided_into; // Points to an array of 4 child blocks
+	bool is_divided; // true if the block has been divided
+	bool must_process;
+} image_rangeblock_type;
+
+#define IMAGE_MAXBLOCKSIZE 128
+
+#define IMAGE_MAXDXY 1023
+#define IMAGE_LHALFDXY 512
+#define IMAGE_HHALFDXY 511
+
+#define IMAGE_RB_TRANSFORMATION_NONE 0
+#define IMAGE_RB_TRANSFORMATION_90RIGHT 1
+#define IMAGE_RB_TRANSFORMATION_180RIGHT 2
+#define IMAGE_RB_TRANSFORMATION_270RIGHT 3
+#define IMAGE_RB_TRANSFORMATION_VFLIP 4
+#define IMAGE_RB_TRANSFORMATION_HFLIP 5
+#define IMAGE_RB_TRANSFORMATION_MAINDIAGFLIP 6
+#define IMAGE_RB_TRANSFORMATION_ANTIDIAGFLIP 7
+
+#endif
diff --git a/src/image_misc.c b/src/image_misc.c
new file mode 100644
index 0000000..e55a182
--- /dev/null
+++ b/src/image_misc.c
@@ -0,0 +1,283 @@
+
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include "image_misc.h"
+
+void GetDxyBitsSize(unsigned int w, unsigned int h, unsigned int *dxy_bitssize)
+{
+	// Bit width for domain-block indices 0..(w/2-1)*(h/2-1)-1 ((2-1): the minimal range block is 2x2).
+	// Equals floor(log2(2*max_index)), e.g. 4 needs 3 bits = log2(4*2). Integer shifts replace log2()
+	// so that max_index == 0 (a 4x4 image) yields 0 bits instead of converting log2(0) = -inf (undefined).
+	unsigned int doubled_max = ((w/2-1)*(h/2-1)-1)*2;
+	for(*dxy_bitssize = 0; doubled_max > 1; doubled_max >>= 1) (*dxy_bitssize)++;
+}
+
+unsigned int GetCblockBitsSize(unsigned int dxy_bitssize)
+{
+	// Fixed-width part of a cblock:
+	// 3 bits of rotation, 5 bits of contrast and 7 bits of brightness
+	const unsigned int fixed_bits = 3+5+7;
+
+	// The dxy field is the only part whose width depends on the image
+	return fixed_bits+dxy_bitssize;
+}
+
+void SetBlocksPointers(image_rangeblock_type *blocks, unsigned int blocksize, unsigned int nof_blocks, unsigned int w, unsigned int h)
+{
+	// blocks holds the levels back to back: nof_blocks entries, then 4 times as many, and so on. Each entry
+	// of a level gets divided_into pointed at its own group of 4 entries in the next level. w and h are unused.
+	image_rangeblock_type *parents = blocks, *children = blocks+nof_blocks;
+	unsigned int n;
+
+	for(; blocksize > 2 && blocksize%2 == 0; blocksize /= 2) {
+		image_rangeblock_type *next_parents = children;
+		for(n = 0; n < nof_blocks; n++, children += 4)
+			parents[n].divided_into = children;
+		parents = next_parents;
+		nof_blocks *= 4;
+	}
+}
+
+void Scale2to1(unsigned char *src, unsigned char *dst, unsigned int w, unsigned int h)
+{
+	// Halves the w x h image src: every output sample is the truncated mean of a 2x2 input cell.
+	// dst receives (w/2) x (h/2) samples, row by row.
+	unsigned int row, col;
+
+	for(row = 0; row < h/2; row++) {
+		const unsigned char *upper = src+(row*2)*w, *lower = upper+w;
+
+		for(col = 0; col < w/2; col++, upper += 2, lower += 2)
+			*dst++ = (unsigned char)((upper[0]+upper[1]+lower[0]+lower[1])/4);
+	}
+}
+
+void ApplyReversedTransformToRangeBlock(unsigned char *src, unsigned char *dst, unsigned int tr, unsigned int blocksize)
+{
+	// Writes into dst the blocksize x blocksize block src with the inverse of transformation tr applied.
+	// Every supported mapping is affine in the destination coordinates, so it is described by three
+	// numbers: the source index feeding dst[0] and the source index steps per destination row/column.
+	// An unknown tr leaves dst untouched. Below, n is blocksize and dst[r][c] is row r, column c.
+	long n = (long)blocksize;
+	long first, row_step, col_step, at;
+	unsigned int row, col;
+
+	switch(tr) {
+		case IMAGE_RB_TRANSFORMATION_NONE: // Plain copy
+			memcpy(dst, src, blocksize*blocksize);
+			return;
+		case IMAGE_RB_TRANSFORMATION_90RIGHT: // Rotate 90 left (inverse transformation): dst[r][c] = src[c][n-1-r]
+			first = n-1;
+			row_step = -1;
+			col_step = n;
+			break;
+		case IMAGE_RB_TRANSFORMATION_180RIGHT: // Rotate 180 left: dst[r][c] = src[n-1-r][n-1-c]
+			first = n*n-1;
+			row_step = -n;
+			col_step = -1;
+			break;
+		case IMAGE_RB_TRANSFORMATION_270RIGHT: // Rotate 270 left: dst[r][c] = src[n-1-c][r]
+			first = (n-1)*n;
+			row_step = 1;
+			col_step = -n;
+			break;
+		case IMAGE_RB_TRANSFORMATION_VFLIP: // dst[r][c] = src[n-1-r][c]
+			first = (n-1)*n;
+			row_step = -n;
+			col_step = 1;
+			break;
+		case IMAGE_RB_TRANSFORMATION_HFLIP: // dst[r][c] = src[r][n-1-c]
+			first = n-1;
+			row_step = n;
+			col_step = -1;
+			break;
+		case IMAGE_RB_TRANSFORMATION_MAINDIAGFLIP: // dst[r][c] = src[c][r]
+			first = 0;
+			row_step = 1;
+			col_step = n;
+			break;
+		case IMAGE_RB_TRANSFORMATION_ANTIDIAGFLIP: // dst[r][c] = src[n-1-c][n-1-r]
+			first = n*n-1;
+			row_step = -1;
+			col_step = -n;
+			break;
+		default:
+			return;
+	}
+
+	for(row = 0; row < blocksize; row++) {
+		at = first+(long)row*row_step;
+		for(col = 0; col < blocksize; col++, at += col_step)
+			*dst++ = src[at];
+	}
+}
+
+bool UnpackChannels(image_pc_type *in, image_uc_type *out)
+{
+	// Splits the interleaved samples of in into one freshly allocated plane per channel in out.
+	// Returns false when in->alloc_memory does not equal w*h*nof_channels or when an allocation
+	// fails (everything allocated so far is freed); out's scalar fields are filled in either way.
+	const unsigned int nof_channels = in->nof_channels;
+	const unsigned int plane_size = in->w*in->h;
+	unsigned int c, px;
+
+	out->w = in->w;
+	out->h = in->h;
+	out->nof_channels = nof_channels;
+	out->alloc_memory = plane_size;
+
+	if(in->alloc_memory != nof_channels*plane_size)
+		return false;
+
+	// Table of plane pointers first, then the planes themselves
+	out->data = malloc(nof_channels*sizeof(void *));
+	if(out->data == NULL)
+		return false;
+
+	for(c = 0; c < nof_channels; c++) {
+		out->data[c] = malloc(plane_size);
+		if(out->data[c] != NULL)
+			continue;
+
+		// Roll back the planes obtained so far together with the pointer table
+		while(c > 0)
+			free(out->data[--c]);
+		free(out->data);
+		return false;
+	}
+
+	// Sample px of channel c sits at offset px*nof_channels+c of the packed array
+	for(c = 0; c < nof_channels; c++) {
+		const unsigned char *packed = in->data+c;
+		unsigned char *plane = out->data[c];
+		for(px = 0; px < plane_size; px++)
+			plane[px] = packed[px*nof_channels];
+	}
+
+	return true;
+}
+
+bool PackChannels(image_uc_type *in, image_pc_type *out)
+{
+	// Interleaves the per-channel planes of in into one freshly allocated packed array in out.
+	// Returns false when w*h*nof_channels does not equal nof_channels*in->alloc_memory or when the
+	// allocation fails; out's scalar fields are filled in either way.
+	const unsigned int nof_channels = in->nof_channels;
+	const unsigned int plane_size = in->alloc_memory;
+	const unsigned int packed_size = in->w*in->h*nof_channels;
+	unsigned char *dst;
+	unsigned int c, px;
+
+	out->w = in->w;
+	out->h = in->h;
+	out->nof_channels = nof_channels;
+	out->alloc_memory = packed_size;
+
+	if(packed_size != nof_channels*plane_size)
+		return false;
+
+	out->data = malloc(packed_size);
+	if(out->data == NULL)
+		return false;
+
+	// Walk the output once, pixel by pixel, taking sample px from every plane in channel order,
+	// so that sample px of channel c lands at offset px*nof_channels+c
+	dst = out->data;
+	for(px = 0; px < plane_size; px++)
+		for(c = 0; c < nof_channels; c++)
+			*dst++ = in->data[c][px];
+
+	return true;
+}
+
+// http://www.w3.org/Graphics/JPEG/jfif3.pdf
+void BGRtoYCBCR(image_uc_type *img)
+{
+	// Converts the image in place: planes 0, 1, 2 are read as B, G, R and overwritten with
+	// Y, Cb, Cr. Each result is clamped to 0..255 and rounded by adding 0.5 before truncation.
+	// Images with fewer than 3 channels are left untouched.
+	// The loop covers w*h samples of each plane.
+	unsigned char *plane[3];
+	unsigned int px;
+	unsigned int c;
+
+	if(img->nof_channels < 3)
+		return;
+
+	for(c = 0; c < 3; c++)
+		plane[c] = img->data[c];
+
+	for(px = 0; px < img->w*img->h; px++) {
+		// All three inputs are read before anything is written back
+		const unsigned char blue = plane[0][px];
+		const unsigned char green = plane[1][px];
+		const unsigned char red = plane[2][px];
+		double ycbcr[3];
+
+		// Y
+		ycbcr[0] = 0.299*red + 0.587*green + 0.114*blue;
+		// Cb, centered on 128
+		ycbcr[1] = -0.1687*red - 0.3313*green + 0.5*blue + 128;
+		// Cr, centered on 128
+		ycbcr[2] = 0.5*red - 0.4187*green - 0.0813*blue + 128;
+
+		// The same clamp-and-round step stores each of the three results
+		for(c = 0; c < 3; c++) {
+			unsigned char *out = &plane[c][px];
+
+			if(ycbcr[c] < 0.0)
+				*out = 0;
+			else if(ycbcr[c] > 255.0)
+				*out = 255;
+			else
+				*out = (unsigned char)(ycbcr[c]+0.5);
+		}
+	}
+}
+
+// http://www.w3.org/Graphics/JPEG/jfif3.pdf
+void YCBCRtoBGR(image_uc_type *img)
+{
+	// Converts the image in place: planes 0, 1, 2 are read as Y, Cb, Cr and overwritten with
+	// B, G, R. Each result is clamped to 0..255 and rounded by adding 0.5 before truncation.
+	// Images with fewer than 3 channels are left untouched.
+	// The loop covers w*h samples of each plane.
+	unsigned char *plane[3];
+	unsigned int px;
+	unsigned int c;
+
+	if(img->nof_channels < 3)
+		return;
+
+	for(c = 0; c < 3; c++)
+		plane[c] = img->data[c];
+
+	for(px = 0; px < img->w*img->h; px++) {
+		// All three inputs are read before anything is written back
+		const unsigned char luma = plane[0][px];
+		const int cb_offset = (int)plane[1][px]-128;
+		const int cr_offset = (int)plane[2][px]-128;
+		double bgr[3];
+
+		// Blue
+		bgr[0] = luma+1.772*cb_offset;
+		// Green
+		bgr[1] = luma-0.34414*cb_offset-0.71414*cr_offset;
+		// Red
+		bgr[2] = luma+1.402*cr_offset;
+
+		// The same clamp-and-round step stores each of the three results
+		for(c = 0; c < 3; c++) {
+			unsigned char *out = &plane[c][px];
+
+			if(bgr[c] < 0.0)
+				*out = 0;
+			else if(bgr[c] > 255.0)
+				*out = 255;
+			else
+				*out = (unsigned char)(bgr[c]+0.5);
+		}
+	}
+}
diff --git a/src/image_misc.h b/src/image_misc.h
new file mode 100644
index 0000000..509707a
--- /dev/null
+++ b/src/image_misc.h
@@ -0,0 +1,19 @@
+
+#ifndef _IMAGE_MISC_H
+#define _IMAGE_MISC_H
+
+#include <stdbool.h>
+
+#include "image.h"
+
+extern void GetDxyBitsSize(unsigned int w, unsigned int h, unsigned int *dxy_bitssize);
+extern unsigned int GetCblockBitsSize(unsigned int dxy_bitssize);
+extern void SetBlocksPointers(image_rangeblock_type *blocks, unsigned int blocksize, unsigned int nof_blocks, unsigned int w, unsigned int h);
+extern void Scale2to1(unsigned char *src, unsigned char *dst, unsigned int w, unsigned int h);
+extern void ApplyReversedTransformToRangeBlock(unsigned char *src, unsigned char *dst, unsigned int tr, unsigned int blocksize);
+extern bool UnpackChannels(image_pc_type *in, image_uc_type *out);
+extern bool PackChannels(image_uc_type *in, image_pc_type *out);
+extern void BGRtoYCBCR(image_uc_type *img);
+extern void YCBCRtoBGR(image_uc_type *img);
+
+#endif
diff --git a/src/tga_file.h b/src/tga_file.h
new file mode 100644
index 0000000..3ad9660
--- /dev/null
+++ b/src/tga_file.h
@@ -0,0 +1,29 @@
+
+#ifndef _TGA_FILE_H
+#define _TGA_FILE_H
+
+#pragma pack (push, 1)
+typedef struct {
+	unsigned char IdLeight;		//Length of the information that follows the header
+	unsigned char ColorMap;		//Color map presence flag (0 - absent, 1 - present)
+	unsigned char DataType;		//Image data / compression type
+								//   0 - No Image Data Included
+								//   1 - Uncompressed, Color-mapped Image
+								//   2 - Uncompressed, True-color Image
+								//   3 - Uncompressed, Black-and-white Image
+								//   9 - Run-length encoded, Color-mapped Image
+								//   10 - Run-length encoded, True-color Image
+								//   11 - Run-length encoded, Black-and-white Image
+	unsigned short CmapStart;	//Start of the palette
+	unsigned short CmapLength;	//Length of the palette
+	unsigned char CmapDepth;	//Depth of the palette entries (15, 16, 24, 32)
+	unsigned short X_Origin;	//Image origin along the X axis
+	unsigned short Y_Origin;	//Image origin along the Y axis
+	unsigned short TGAWidth;	//Image width
+	unsigned short TGAHeight;	//Image height
+	unsigned char BitPerPel;	//Number of bits per pixel (8, 16, 24, 32)
+	unsigned char Description;	//Image descriptor
+} TGAHEADER;
+#pragma pack (pop)
+
+#endif
diff --git a/src/tga_load.c b/src/tga_load.c
new file mode 100644
index 0000000..4b69b6a
--- /dev/null
+++ b/src/tga_load.c
@@ -0,0 +1,237 @@
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "tga_file.h"
+#include "tga_load.h"
+
+static size_t GetFileLength(FILE *f)
+{
+	// Returns the file size in bytes and restores the read position; returns 0 if the stream cannot be
+	// measured, because storing ftell()'s -1 error value in a size_t would report a gigantic length.
+	long cur_pos = ftell(f), file_len = -1;
+	if(cur_pos >= 0 && fseek(f, 0, SEEK_END) == 0)
+		file_len = ftell(f);
+	if(cur_pos < 0 || fseek(f, cur_pos, SEEK_SET) != 0 || file_len < 0)
+		return 0;
+	return (size_t)file_len;
+}
+
+// Checks that the header describes an image type this loader supports.
+// Returns TGALOAD_OKAY, TGALOAD_DAMAGEDFILE (contradictory fields) or TGALOAD_UNSUPPORTEDFILETYPE.
+static int tgaCheckHeader(const TGAHEADER *head)
+{
+	// Each pair of cases is the uncompressed and the run-length encoded variant of one image kind
+	switch(head->DataType) {
+		case 1: // Color-mapped: 8-bit indices into a 24- or 32-bit palette
+		case 9:
+			if((head->ColorMap != 1) || (head->BitPerPel != 8))
+				return TGALOAD_DAMAGEDFILE;
+			if((head->CmapDepth != 24) && (head->CmapDepth != 32))
+				return TGALOAD_UNSUPPORTEDFILETYPE;
+			return TGALOAD_OKAY;
+		case 2: // True-color: 24 or 32 bits per pixel, no palette
+		case 10:
+			if(head->ColorMap != 0)
+				return TGALOAD_DAMAGEDFILE;
+			if((head->BitPerPel != 24) && (head->BitPerPel != 32))
+				return TGALOAD_UNSUPPORTEDFILETYPE;
+			return TGALOAD_OKAY;
+		case 3: // Greyscale: 8 bits per pixel, no palette
+		case 11:
+			if(head->ColorMap != 0)
+				return TGALOAD_DAMAGEDFILE;
+			if(head->BitPerPel != 8)
+				return TGALOAD_UNSUPPORTEDFILETYPE;
+			return TGALOAD_OKAY;
+		default:
+			return TGALOAD_UNSUPPORTEDFILETYPE;
+	}
+}
+
+// Expands the TGA run-length packets in in[0..in_len) into nof_pixels pixels of px_size bytes at out.
+// Returns 1 on success and 0 when a packet overruns the image or the input ends too early.
+static int tgaUnpackRle(const unsigned char *in, size_t in_len, unsigned char *out, size_t nof_pixels, size_t px_size)
+{
+	size_t done = 0, count, payload, j;
+	int is_run;
+
+	while(done < nof_pixels) {
+		// Every packet starts with a one-byte header
+		if(in_len < 1)
+			return 0;
+		is_run = *in & 0x80; // High bit set: run-length packet; clear: raw packet
+		count = (size_t)(*in & 0x7F)+1; // Low 7 bits: number of pixels minus one
+		in++;
+		in_len--;
+
+		// A run stores one pixel that is repeated count times, a raw packet stores count pixels
+		payload = is_run ? px_size : count*px_size;
+		if((count > nof_pixels-done) || (payload > in_len))
+			return 0;
+
+		if(is_run) {
+			for(j = 0; j < count; j++)
+				memcpy(out+j*px_size, in, px_size);
+		} else {
+			memcpy(out, in, payload);
+		}
+
+		out += count*px_size;
+		in += payload;
+		in_len -= payload;
+		done += count;
+	}
+
+	return 1;
+}
+
+// Loads the TGA file fname into *image as packed channels, in the byte order stored in the file.
+// Handles raw and RLE data for color-mapped (8-bit indices, 24/32-bit palette), true-color
+// (24/32 bpp) and greyscale (8 bpp) images; color-mapped pixels are expanded through the palette.
+// On TGALOAD_OKAY image->data is a malloc()ed buffer of image->alloc_memory bytes; on any failure
+// after the file was opened image->data is NULL. Return values:
+//   TGALOAD_OKAY                - the image was loaded
+//   TGALOAD_CANTOPENFILE        - fname could not be opened
+//   TGALOAD_DAMAGEDFILE         - truncated file, contradictory header, corrupt RLE data or
+//                                 a pixel that refers to a palette entry not stored in the file
+//   TGALOAD_UNSUPPORTEDFILETYPE - a data type or bit depth that is not handled
+//   TGALOAD_MEMORYALLOCERROR    - a buffer could not be allocated or the image is too large
+int tgaLoad(char *fname, image_pc_type *image)
+{
+	FILE *f;
+	TGAHEADER head;
+	unsigned int bpp;
+	unsigned char *pal = 0, *packed = 0, *indices = 0;
+	unsigned char *target, *p, *p2;
+	unsigned char b;
+	size_t file_len, nof_pixels, px_size, packed_len, row_len, i, j;
+	long pos;
+	int result;
+
+	f = fopen(fname, "rb");
+
+	if(!f)
+		return TGALOAD_CANTOPENFILE;
+
+	image->data = 0;
+	file_len = GetFileLength(f);
+
+	// From here on every exit goes through cleanup so that the file and all buffers are released
+	result = TGALOAD_DAMAGEDFILE;
+	if(fread(&head, sizeof(TGAHEADER), 1, f) != 1)
+		goto cleanup;
+
+	result = tgaCheckHeader(&head);
+	if(result != TGALOAD_OKAY)
+		goto cleanup;
+
+	// Bytes per output pixel
+	if((head.BitPerPel == 8) && (head.ColorMap == 0)) { // Greyscale images
+		bpp = 1;
+	} else if((head.BitPerPel == 32) || ((head.BitPerPel == 8) && (head.CmapDepth == 32))) { // 32 bpp images (with or without a palette)
+		bpp = 4;
+	} else { // 24 bpp images (with or without a palette)
+		bpp = 3;
+	}
+
+	image->nof_channels = bpp;
+	image->w = head.TGAWidth;
+	image->h = head.TGAHeight;
+
+	// alloc_memory is an unsigned int, so the byte count of the image must fit into one
+	result = TGALOAD_MEMORYALLOCERROR;
+	nof_pixels = (size_t)head.TGAWidth*head.TGAHeight;
+	if(nof_pixels > ((unsigned int)-1)/bpp)
+		goto cleanup;
+
+	image->alloc_memory = (unsigned int)(nof_pixels*bpp);
+	image->data = malloc(image->alloc_memory);
+	if(!image->data)
+		goto cleanup;
+
+	// The image ID comes right after the header and before the color map, so it is skipped first
+	result = TGALOAD_DAMAGEDFILE;
+	if(fseek(f, head.IdLeight, SEEK_CUR) != 0)
+		goto cleanup;
+
+	// Palette entries have the same size as an output pixel (bpp bytes)
+	if(head.ColorMap == 1) {
+		pal = malloc((size_t)head.CmapLength*bpp+1); // +1: never request 0 bytes for an empty palette
+		indices = malloc(nof_pixels+1); // One palette index per pixel, expanded further below
+		if(!pal || !indices) {
+			result = TGALOAD_MEMORYALLOCERROR;
+			goto cleanup;
+		}
+		if(fread(pal, bpp, head.CmapLength, f) != head.CmapLength)
+			goto cleanup;
+	}
+
+	// Color-mapped files store one index byte per pixel, all other files store bpp bytes per pixel
+	px_size = indices ? 1 : bpp;
+	target = indices ? indices : image->data;
+
+	if(head.DataType < 9) { // Types 1, 2, 3: uncompressed pixel data
+		if(fread(target, px_size, nof_pixels, f) != nof_pixels)
+			goto cleanup;
+	} else { // Types 9, 10, 11: run-length encoded pixel data, unpacked from the rest of the file
+		pos = ftell(f);
+		if((pos < 0) || ((size_t)pos > file_len))
+			goto cleanup;
+		packed_len = file_len-(size_t)pos;
+
+		packed = malloc(packed_len+1);
+		if(!packed) {
+			result = TGALOAD_MEMORYALLOCERROR;
+			goto cleanup;
+		}
+		if(fread(packed, 1, packed_len, f) != packed_len)
+			goto cleanup;
+		if(!tgaUnpackRle(packed, packed_len, target, nof_pixels, px_size))
+			goto cleanup;
+	}
+
+	// Expand the palette indices into image->data
+	if(indices) {
+		p = image->data;
+		for(i = 0; i < nof_pixels; i++) {
+			// CmapStart is the index of the first palette entry stored in the file, so pixel values
+			// CmapStart..CmapStart+CmapLength-1 map to entries 0..CmapLength-1 of pal
+			if((indices[i] < head.CmapStart) || (indices[i]-head.CmapStart >= head.CmapLength))
+				goto cleanup;
+			memcpy(p, pal+(size_t)(indices[i]-head.CmapStart)*bpp, bpp);
+			p += bpp;
+		}
+	}
+
+	// Flip along the y axis (if necessary, GIMP can save files with this flag set)
+	if(head.Description & 0x20) {
+		row_len = (size_t)image->w*bpp;
+		for(i = 0; i < image->h/2; i++) {
+			p = image->data+i*row_len;
+			p2 = image->data+(image->h-1-i)*row_len;
+			for(j = 0; j < row_len; j++) { // Swapped in place: no temporary row that could fail to allocate
+				b = p[j];
+				p[j] = p2[j];
+				p2[j] = b;
+			}
+		}
+	}
+
+	result = TGALOAD_OKAY;
+
+cleanup:
+	// A failed load must not hand a half-filled buffer to the caller
+	if(result != TGALOAD_OKAY) {
+		free(image->data);
+		image->data = 0;
+	}
+	// free(NULL) is a no-op, so buffers that were never allocated need no special casing
+	free(indices);
+	free(packed);
+	free(pal);
+	fclose(f);
+
+	return result;
+}
diff --git a/src/tga_load.h b/src/tga_load.h
new file mode 100644
index 0000000..b956cb6
--- /dev/null
+++ b/src/tga_load.h
@@ -0,0 +1,15 @@
+
+#ifndef _TGA_LOAD_H
+#define _TGA_LOAD_H
+
+#include "image.h"
+
+#define TGALOAD_OKAY 0
+#define TGALOAD_CANTOPENFILE 1
+#define TGALOAD_DAMAGEDFILE 2
+#define TGALOAD_UNSUPPORTEDFILETYPE 3
+#define TGALOAD_MEMORYALLOCERROR 4
+
+extern int tgaLoad(char *fname, image_pc_type *image);
+
+#endif
diff --git a/src/tga_save.c b/src/tga_save.c
new file mode 100644
index 0000000..e790cbf
--- /dev/null
+++ b/src/tga_save.c
@@ -0,0 +1,40 @@
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "tga_file.h"
+#include "tga_save.h"
+
+int tgaSave(char *fname, image_pc_type *image)
+{
+	// Writes image as an uncompressed TGA file: greyscale for 1 channel, true-color for 3 or 4.
+	// Returns TGASAVE_OKAY or the TGASAVE_* code of the first check that failed.
+	FILE *f;
+	TGAHEADER head;
+
+	// Validate before fopen(): "wb" truncates an existing file, and returning after fopen()
+	// without fclose() would leak the stream
+	if(image->w*image->h*image->nof_channels != image->alloc_memory)
+		return TGASAVE_DAMAGEDIMAGESTRUCT;
+	if(image->nof_channels > 4 || image->nof_channels == 2 || image->nof_channels == 0)
+		return TGASAVE_TOOMANYCHANNELS;
+	if(image->w > 0xFFFF || image->h > 0xFFFF) // The header stores both in 16-bit fields
+		return TGASAVE_DAMAGEDIMAGESTRUCT;
+
+	f = fopen(fname, "wb");
+	if(!f)
+		return TGASAVE_CANTOPENFILE;
+
+	memset(&head, 0, sizeof(TGAHEADER));
+	head.TGAWidth = image->w;
+	head.TGAHeight = image->h;
+	head.DataType = (image->nof_channels == 1) ? 3 : 2; // 3 - greyscale, 2 - true-color
+	head.BitPerPel = image->nof_channels*8;
+
+	// NOTE(review): write errors are not reported, there is no TGASAVE_* code for them
+	fwrite(&head, sizeof(head), 1, f);
+	fwrite(image->data, 1, image->alloc_memory, f);
+	fclose(f);
+	return TGASAVE_OKAY;
+}
diff --git a/src/tga_save.h b/src/tga_save.h
new file mode 100644
index 0000000..b7b379d
--- /dev/null
+++ b/src/tga_save.h
@@ -0,0 +1,14 @@
+
+#ifndef _TGA_SAVE_H
+#define _TGA_SAVE_H
+
+#include "image.h"
+
+#define TGASAVE_OKAY 0
+#define TGASAVE_CANTOPENFILE 1
+#define TGASAVE_DAMAGEDIMAGESTRUCT 2
+#define TGASAVE_TOOMANYCHANNELS 3
+
+extern int tgaSave(char *fname, image_pc_type *image);
+
+#endif
diff --git a/test0.bat b/test0.bat
new file mode 100644
index 0000000..d2293d3
--- /dev/null
+++ b/test0.bat
@@ -0,0 +1,16 @@
+Release\compress.exe chihiro2.tga chihiro2_b8.fi 8
+Release\decompress.exe chihiro2_b8.fi chihiro2_b8.dec.tga
+Release\compress.exe chihiro2.tga chihiro2_b4.fi 4
+Release\decompress.exe chihiro2_b4.fi chihiro2_b4.dec.tga
+Release\compress.exe chihiro2.tga chihiro2_b2.fi 2
+Release\decompress.exe chihiro2_b2.fi chihiro2_b2.dec.tga
+Release\compress.exe chihiro2.tga chihiro2_b8_rms2.fi 8 4 2
+Release\decompress.exe chihiro2_b8_rms2.fi chihiro2_b8_rms2.dec.tga
+Release\compress.exe chihiro2.tga chihiro2_b8_rms4.fi 8 4 4
+Release\decompress.exe chihiro2_b8_rms4.fi chihiro2_b8_rms4.dec.tga
+Release\compress.exe chihiro2.tga chihiro2_b8_rms6.fi 8 4 6
+Release\decompress.exe chihiro2_b8_rms6.fi chihiro2_b8_rms6.dec.tga
+Release\compress.exe chihiro2.tga chihiro2_b8_rms10.fi 8 4 10
+Release\decompress.exe chihiro2_b8_rms10.fi chihiro2_b8_rms10.dec.tga
+Release\compress.exe chihiro2.tga chihiro2_b8_rms50.fi 8 4 50
+Release\decompress.exe chihiro2_b8_rms50.fi chihiro2_b8_rms50.dec.tga
\ No newline at end of file
diff --git a/test0.md5 b/test0.md5
new file mode 100644
index 0000000..7836c1d
--- /dev/null
+++ b/test0.md5
@@ -0,0 +1,16 @@
+dfe6421540d4ad169c23498bc5b4aa29 *chihiro2_b2.fi
+bac1dc505a5c9be23b27b8e840f5f8f9 *chihiro2_b2.dec.tga
+ce34f55126ee2f980868d08ac1239f68 *chihiro2_b4.fi
+e2037641abf7954b1513a5ebe53506eb *chihiro2_b4.dec.tga
+eb63ca99a084983b91f6414a7e3c241e *chihiro2_b8.fi
+25455b476b7f29b83e803e4a14ab3b19 *chihiro2_b8.dec.tga
+08b69485bfdaefe96c8cb478594f4a0f *chihiro2_b8_rms10.fi
+fe97f99bb573263f31498d3bdd3c19d3 *chihiro2_b8_rms10.dec.tga
+4584a2eb7932474a2b89978aa4b9dce5 *chihiro2_b8_rms2.fi
+1e242b76cc759de203f12ce3e2cca486 *chihiro2_b8_rms2.dec.tga
+d92c596ef1cec8f68616ef948284cae4 *chihiro2_b8_rms4.fi
+a4ab1ee520223fe616069e7e1a2aa3ba *chihiro2_b8_rms4.dec.tga
+035e900148cb8c6945a65cb1669c75af *chihiro2_b8_rms50.fi
+382986d755fad7ec1202776fd628c866 *chihiro2_b8_rms50.dec.tga
+f0515cd41769618ead8867b876772d9c *chihiro2_b8_rms6.fi
+8e84cc78593104f92dc4e00d7550e1f3 *chihiro2_b8_rms6.dec.tga
diff --git a/unix_makefiles/Release/do_not_delete.txt b/unix_makefiles/Release/do_not_delete.txt
new file mode 100644
index 0000000..e69de29
diff --git a/unix_makefiles/ReleaseMPI/do_not_delete.txt b/unix_makefiles/ReleaseMPI/do_not_delete.txt
new file mode 100644
index 0000000..e69de29
diff --git a/unix_makefiles/makefile b/unix_makefiles/makefile
new file mode 100644
index 0000000..c9e341c
--- /dev/null
+++ b/unix_makefiles/makefile
@@ -0,0 +1,29 @@
+CC=gcc
+CPP=g++
+CFLAGS=-c -Wall -O2
+CFLAGS_MPI=$(CFLAGS) -DI_USE_MPI -I/usr/include/mpich2/
+SOURCES_COMPRESS=../src/arrays.c ../src/compress.c ../src/fi_save.c ../src/fi_save_blocks_compression.c ../src/fi_save_blocks_search.c ../src/image_misc.c ../src/tga_load.c ../src/cpu_driven/fi_save_blocks_search_workers.c
+OBJECTS_COMPRESS=arrays.o compress.o fi_save.o fi_save_blocks_compression.o fi_save_blocks_search.o image_misc.o tga_load.o fi_save_blocks_search_workers.o
+SOURCES_DECOMPRESS=../src/arrays.c ../src/decompress.c ../src/fi_load.c ../src/image_misc.c ../src/tga_save.c
+OBJECTS_DECOMPRESS=arrays.o decompress.o fi_load.o image_misc.o tga_save.o
+
+# Targets are commands, not files; they share the *.o names in this directory, so never build them in parallel
+.PHONY: all release release_mpi compress compress_mpi decompress clean
+.NOTPARALLEL:
+
+all: release release_mpi
+release: compress decompress
+release_mpi: compress_mpi
+
+# Link lines name libraries after the objects because the linker resolves symbols left to right
+compress:
+	$(CC) $(CFLAGS) $(SOURCES_COMPRESS)
+	$(CPP) -s -o Release/compress $(OBJECTS_COMPRESS) -lpthread
+compress_mpi:
+	$(CC) $(CFLAGS_MPI) $(SOURCES_COMPRESS)
+	$(CPP) -s -o ReleaseMPI/compress $(OBJECTS_COMPRESS) -lpthread -lmpi
+decompress:
+	$(CC) $(CFLAGS) $(SOURCES_DECOMPRESS)
+	$(CPP) -s -o Release/decompress $(OBJECTS_DECOMPRESS)
+clean:
+	rm -rf *.o Release/compress Release/decompress ReleaseMPI/compress
diff --git a/unix_projects/compress/compress.cbp b/unix_projects/compress/compress.cbp
new file mode 100644
index 0000000..057809c
--- /dev/null
+++ b/unix_projects/compress/compress.cbp
@@ -0,0 +1,83 @@
+<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
+<CodeBlocks_project_file>
+	<FileVersion major="1" minor="6" />
+	<Project>
+		<Option title="compress" />
+		<Option pch_mode="2" />
+		<Option compiler="gcc" />
+		<Build>
+			<Target title="Debug">
+				<Option output="../bin/Debug/compress" prefix_auto="1" extension_auto="1" />
+				<Option object_output="../obj/Debug/" />
+				<Option type="1" />
+				<Option compiler="gcc" />
+				<Compiler>
+					<Add option="-g" />
+				</Compiler>
+			</Target>
+			<Target title="Release">
+				<Option output="../bin/Release/compress" prefix_auto="1" extension_auto="1" />
+				<Option object_output="../obj/Release/" />
+				<Option type="1" />
+				<Option compiler="gcc" />
+				<Compiler>
+					<Add option="-O2" />
+				</Compiler>
+				<Linker>
+					<Add option="-s" />
+				</Linker>
+			</Target>
+			<Target title="Release (MPI)">
+				<Option output="../bin/ReleaseMPI/compress" prefix_auto="1" extension_auto="1" />
+				<Option object_output="../obj/ReleaseMPI/" />
+				<Option type="1" />
+				<Option compiler="gcc" />
+				<Compiler>
+					<Add option="-O2" />
+					<Add option="-DI_USE_MPI" />
+					<Add directory="/usr/include/mpich2/" />
+				</Compiler>
+				<Linker>
+					<Add option="-s" />
+					<Add library="mpi" />
+				</Linker>
+			</Target>
+		</Build>
+		<Compiler>
+			<Add option="-Wall" />
+		</Compiler>
+		<Linker>
+			<Add library="pthread" />
+		</Linker>
+		<Unit filename="../../src/arrays.c">
+			<Option compilerVar="CC" />
+		</Unit>
+		<Unit filename="../../src/compress.c">
+			<Option compilerVar="CC" />
+		</Unit>
+		<Unit filename="../../src/cpu_driven/fi_save_blocks_search_workers.c">
+			<Option compilerVar="CC" />
+		</Unit>
+		<Unit filename="../../src/fi_save.c">
+			<Option compilerVar="CC" />
+		</Unit>
+		<Unit filename="../../src/fi_save_blocks_compression.c">
+			<Option compilerVar="CC" />
+		</Unit>
+		<Unit filename="../../src/fi_save_blocks_search.c">
+			<Option compilerVar="CC" />
+		</Unit>
+		<Unit filename="../../src/image_misc.c">
+			<Option compilerVar="CC" />
+		</Unit>
+		<Unit filename="../../src/tga_load.c">
+			<Option compilerVar="CC" />
+		</Unit>
+		<Extensions>
+			<envvars />
+			<code_completion />
+			<debugger />
+			<lib_finder disable_auto="1" />
+		</Extensions>
+	</Project>
+</CodeBlocks_project_file>
diff --git a/unix_projects/compress/compress.depend b/unix_projects/compress/compress.depend
new file mode 100644
index 0000000..782fb9a
--- /dev/null
+++ b/unix_projects/compress/compress.depend
@@ -0,0 +1,107 @@
+# depslib dependency file v1.0
+1447453270 source:/media/CC01-601D/fractal_image_compression/src/arrays.c
+	"arrays.h"
+
+1447378552 /media/CC01-601D/fractal_image_compression/src/arrays.h
+	<stdbool.h>
+
+1456175332 source:/media/CC01-601D/fractal_image_compression/src/compress.c
+	<stdio.h>
+	<stdlib.h>
+	<math.h>
+	<ctype.h>
+	<mpi.h>
+	"tga_load.h"
+	"image_misc.h"
+	"fi_save.h"
+
+1446493988 /media/CC01-601D/fractal_image_compression/src/tga_load.h
+	"image.h"
+
+1455143304 /media/CC01-601D/fractal_image_compression/src/image.h
+	<stdbool.h>
+
+1454630460 /media/CC01-601D/fractal_image_compression/src/image_misc.h
+	<stdbool.h>
+	"image.h"
+
+1447435442 /media/CC01-601D/fractal_image_compression/src/fi_save.h
+	"image.h"
+
+1457447856 source:/media/CC01-601D/fractal_image_compression/src/cpu_driven/fi_save_blocks_search_workers.c
+	<stdio.h>
+	<string.h>
+	"../image_misc.h"
+	"../fi_save_blocks_search_workers.h"
+
+1456176240 /media/CC01-601D/fractal_image_compression/src/fi_save_blocks_search_workers.h
+	"image.h"
+
+1454626304 source:/media/CC01-601D/fractal_image_compression/src/fi_save_blocks_compression.c
+	<stdio.h>
+	<stdlib.h>
+	<string.h>
+	"arrays.h"
+	"fi_file.h"
+	"fi_save.h"
+	"image_misc.h"
+	"fi_save_blocks_compression.h"
+
+1446855158 /media/CC01-601D/fractal_image_compression/src/fi_file.h
+
+1454626284 /media/CC01-601D/fractal_image_compression/src/fi_save_blocks_compression.h
+
+1456175564 source:/media/CC01-601D/fractal_image_compression/src/fi_save.c
+	<stdio.h>
+	<stdlib.h>
+	<string.h>
+	<time.h>
+	<mpi.h>
+	"arrays.h"
+	"fi_file.h"
+	"fi_save.h"
+	"fi_save_blocks_compression.h"
+	"fi_save_blocks_search.h"
+	"image_misc.h"
+
+1454624988 /media/CC01-601D/fractal_image_compression/src/fi_save_blocks_search.h
+	"image.h"
+
+1454630464 source:/media/CC01-601D/fractal_image_compression/src/image_misc.c
+	<stdlib.h>
+	<string.h>
+	<math.h>
+	"image_misc.h"
+
+1446611444 source:/media/CC01-601D/fractal_image_compression/src/tga_load.c
+	<stdio.h>
+	<stdlib.h>
+	<string.h>
+	"tga_file.h"
+	"tga_load.h"
+
+1446489332 /media/CC01-601D/fractal_image_compression/src/tga_file.h
+
+1456177190 source:/media/CC01-601D/fractal_image_compression/src/fi_save_blocks_search.c
+	<stdio.h>
+	<string.h>
+	<stdlib.h>
+	<Windows.h>
+	<process.h>
+	<pthread.h>
+	<mpi.h>
+	"image_misc.h"
+	"fi_save.h"
+	"fi_save_blocks_search.h"
+	"fi_save_blocks_search_workers.h"
+
+1365770559 /usr/include/mpich2//mpi.h
+	"mpio.h"
+	"mpicxx.h"
+
+1365770561 /usr/include/mpich2//mpio.h
+	"mpi.h"
+
+1365770562 /usr/include/mpich2//mpicxx.h
+	<stdio.h>
+
diff --git a/unix_projects/compress/compress.layout b/unix_projects/compress/compress.layout
new file mode 100644
index 0000000..42ad001
--- /dev/null
+++ b/unix_projects/compress/compress.layout
@@ -0,0 +1,22 @@
+<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
+<CodeBlocks_layout_file>
+	<ActiveTarget name="Release (MPI)" />
+	<File name="../../src/arrays.c" open="0" top="0" tabpos="0">
+		<Cursor position="419" topLine="3" />
+	</File>
+	<File name="../../src/compress.c" open="1" top="0" tabpos="1">
+		<Cursor position="102" topLine="7" />
+	</File>
+	<File name="../../src/cpu_driven/fi_save_blocks_search_workers.c" open="0" top="0" tabpos="2">
+		<Cursor position="6248" topLine="178" />
+	</File>
+	<File name="../../src/fi_save.c" open="1" top="0" tabpos="2">
+		<Cursor position="741" topLine="18" />
+	</File>
+	<File name="../../src/fi_save_blocks_compression.c" open="1" top="1" tabpos="3">
+		<Cursor position="0" topLine="0" />
+	</File>
+	<File name="../../src/fi_save_blocks_search.c" open="1" top="0" tabpos="4">
+		<Cursor position="5069" topLine="140" />
+	</File>
+</CodeBlocks_layout_file>
diff --git a/unix_projects/decompress/decompress.cbp b/unix_projects/decompress/decompress.cbp
new file mode 100644
index 0000000..4a6c1c5
--- /dev/null
+++ b/unix_projects/decompress/decompress.cbp
@@ -0,0 +1,56 @@
+<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
+<CodeBlocks_project_file>
+	<FileVersion major="1" minor="6" />
+	<Project>
+		<Option title="decompress" />
+		<Option pch_mode="2" />
+		<Option compiler="gcc" />
+		<Build>
+			<Target title="Debug">
+				<Option output="../bin/Debug/decompress" prefix_auto="1" extension_auto="1" />
+				<Option object_output="../obj/Debug/" />
+				<Option type="1" />
+				<Option compiler="gcc" />
+				<Compiler>
+					<Add option="-g" />
+				</Compiler>
+			</Target>
+			<Target title="Release">
+				<Option output="../bin/Release/decompress" prefix_auto="1" extension_auto="1" />
+				<Option object_output="../obj/Release/" />
+				<Option type="1" />
+				<Option compiler="gcc" />
+				<Compiler>
+					<Add option="-O2" />
+				</Compiler>
+				<Linker>
+					<Add option="-s" />
+				</Linker>
+			</Target>
+		</Build>
+		<Compiler>
+			<Add option="-Wall" />
+		</Compiler>
+		<Unit filename="../../src/arrays.c">
+			<Option compilerVar="CC" />
+		</Unit>
+		<Unit filename="../../src/decompress.c">
+			<Option compilerVar="CC" />
+		</Unit>
+		<Unit filename="../../src/fi_load.c">
+			<Option compilerVar="CC" />
+		</Unit>
+		<Unit filename="../../src/image_misc.c">
+			<Option compilerVar="CC" />
+		</Unit>
+		<Unit filename="../../src/tga_save.c">
+			<Option compilerVar="CC" />
+		</Unit>
+		<Extensions>
+			<envvars />
+			<code_completion />
+			<debugger />
+			<lib_finder disable_auto="1" />
+		</Extensions>
+	</Project>
+</CodeBlocks_project_file>
diff --git a/unix_projects/decompress/decompress.depend b/unix_projects/decompress/decompress.depend
new file mode 100644
index 0000000..c4ac310
--- /dev/null
+++ b/unix_projects/decompress/decompress.depend
@@ -0,0 +1 @@
+# depslib dependency file v1.0
diff --git a/unix_projects/decompress/decompress.layout b/unix_projects/decompress/decompress.layout
new file mode 100644
index 0000000..0e578cf
--- /dev/null
+++ b/unix_projects/decompress/decompress.layout
@@ -0,0 +1,13 @@
+<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
+<CodeBlocks_layout_file>
+	<ActiveTarget name="Release" />
+	<File name="../../src/arrays.c" open="1" top="0" tabpos="1">
+		<Cursor position="0" topLine="0" />
+	</File>
+	<File name="../../src/decompress.c" open="0" top="0" tabpos="0">
+		<Cursor position="1667" topLine="53" />
+	</File>
+	<File name="../../src/fi_load.c" open="1" top="0" tabpos="2">
+		<Cursor position="6689" topLine="165" />
+	</File>
+</CodeBlocks_layout_file>
diff --git a/unix_projects/fractal_image_compression.workspace b/unix_projects/fractal_image_compression.workspace
new file mode 100644
index 0000000..a217cdb
--- /dev/null
+++ b/unix_projects/fractal_image_compression.workspace
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
+<CodeBlocks_workspace_file>
+	<Workspace title="Workspace">
+		<Project filename="decompress/decompress.cbp" />
+		<Project filename="compress/compress.cbp" active="1" />
+	</Workspace>
+</CodeBlocks_workspace_file>