CIS565-Fall-2016 · gabenaghi · Nov 2, 2016 · Nov 3, 2016 · Nov 3, 2016 · Nov 4, 2016
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -0,0 +1,34 @@
+cmake_minimum_required(VERSION 3.0)
+
+project(cis565_parallel_fft)
+
+set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH})
+
+# Enable C++11 for host code
+set(CMAKE_CXX_STANDARD 11)
+
+list(APPEND CUDA_NVCC_FLAGS_DEBUG -G -g)
+list(APPEND CUDA_NVCC_FLAGS_RELWITHDEBUGINFO -lineinfo)
+
+# Crucial magic for CUDA linking
+find_package(Threads REQUIRED)
+find_package(CUDA REQUIRED)
+
+set(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE ON)
+set(CUDA_SEPARABLE_COMPILATION ON)
+
+if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
+    set(CUDA_PROPAGATE_HOST_FLAGS OFF)
+endif()
+
+include_directories(.)
+add_subdirectory(parallel_fft)
+
+cuda_add_executable(${CMAKE_PROJECT_NAME}
+    "src/main.cpp"
+    )
+
+target_link_libraries(${CMAKE_PROJECT_NAME}
+    parallel_fft
+    ${CORELIBS}
+    )
diff --git a/INSTRUCTION.md b/INSTRUCTION.md
diff --git a/README.md b/README.md
@@ -1,33 +1,117 @@
-WebGL Deferred Shading
+Parallel Fast Fourier Transform
 ======================
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 5**
 
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) **Google Chrome 222.2** on
-  Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Gabriel Naghi
+* Tested on: 
+ - CPU implementation: Linux OpenSUSE, Intel Xeon E5-2470 @ 2.4 GHz, 32 GB RAM (Eniac)
+ - GPU Implementation: Windows 10, Intel Core i7-2600 @ 3.4 GHz, 8 GB RAM, GeForce GT 730 1024 MB (DSL)
 
-### Live Online
+##Fourier Transforms
+Fourier Transforms define a process by which to trasnform a signal from the time domain to the frequency ("forward transform") and vice versa ("inverse transform"). Fourier Transforms rely on the principle that any signal can be represented as magnitude and phase as a function of frequency.  
 
-[![](img/thumb.png)](http://TODO.github.io/Project5B-WebGL-Deferred-Shading)
+![](img/Fourier_unit_pulse.png)
+Source: Wikipedia
 
-### Demo Video/GIF
+Representing a signal in the freqency domain is useful for a few reasons. First and foremost, it tells you what frequencies are present in the signal, and in what proportions. Another important use is that a multiplication in frequency domain is equivalent to a convolution in the time domain. It is generally easier to transform and multiply than compute a convolution. A similar optimization exists with regard to cross-correlations. It is also much easier to compute the n-th derivative of a function in the frequency domain than in the time domain. There are many other uses of Fourier Transforms (see discussion [here](http://dsp.stackexchange.com/questions/69/why-is-the-fourier-transform-so-important)).
 
-[![](img/video.png)](TODO)
+##Discrete Fourier Transforms
 
-### (TODO: Your README)
+In practice, Discrete Fourier Transforms (DFT) are used. This means that samples are of finite quantity and are equally spaced over time. The transform occurs by correlating each sample with with analyzing functions in the form of sinusoids. Of course, this produces high coefficients when the sample is similar and low amplitudes when dissimilar. 
 
-*DO NOT* leave the README to the last minute! It is a crucial part of the
-project, and we will not be able to grade you without a good README.
+In general the formula for computing a given frequency bucket in a DFT is as follows: 
 
-This assignment has a considerable amount of performance analysis compared
-to implementation work. Complete the implementation early to leave time!
+![](img/dft.png)
+Source: http://www.cmlab.csie.ntu.edu.tw/cml/dsp/training/coding/transform/fft.html
 
+This results in an O(N^2) algorithm. We can do much better.
 
-### Credits
 
-* [Three.js](https://github.com/mrdoob/three.js) by [@mrdoob](https://github.com/mrdoob) and contributors
-* [stats.js](https://github.com/mrdoob/stats.js) by [@mrdoob](https://github.com/mrdoob) and contributors
-* [webgl-debug](https://github.com/KhronosGroup/WebGLDeveloperTools) by Khronos Group Inc.
-* [glMatrix](https://github.com/toji/gl-matrix) by [@toji](https://github.com/toji) and contributors
-* [minimal-gltf-loader](https://github.com/shrekshao/minimal-gltf-loader) by [@shrekshao](https://github.com/shrekshao)
+##Fast Fourier Transforms
+Originally discovered by Carl Friedrich Gauss, and re-popularized by J.W. Cooley and John Tukey in the 20th century, the Fast Fourier Transform exploits two properties to reduce the number of elemetary operations:
+
+![](img/properties.png)
+Source: http://www.cmlab.csie.ntu.edu.tw/cml/dsp/training/coding/transform/fft.html
+
+In short, this allows us to recursively break up DFTs into N/2 point problems in a divide-and-conquer strategy, and recombine the sub-pieces. 
+
+From Wikipedia, the Cooley Tukey FFT pseduocode is as follows:
+
+~~~
+X0,...,N−1 ← ditfft2(x, N, s):      DFT of (x0, xs, x2s, ..., x(N-1)s):
+    if N = 1 then
+        X0 ← x0                     trivial size-1 DFT base case
+    else
+        X0,...,N/2−1 ← ditfft2(x, N/2, 2s)        DFT of (x0, x2s, x4s, ...)
+        XN/2,...,N−1 ← ditfft2(x+s, N/2, 2s)      DFT of (xs, xs+2s, xs+4s, ...)
+        for k = 0 to N/2−1                        combine DFTs of two halves into full DFT:
+            t ← Xk
+            Xk ← t + exp(−2πi k/N) Xk+N/2
+            Xk+N/2 ← t − exp(−2πi k/N) Xk+N/2
+        endfor
+    endif
+~~~
+
+###Parallel Fast Fourier Transform Algorithm
+
+My parallel implementation of the fast fourier transform was divided into three primary stages:
+
+1. Input reorganization
+2. Twiddle factor multiplication
+3. Butterfly
+
+The first stage simply consists of reorganizing the inputs to such an order that the outputs will be in their respective correct buckets. As it happens, the output order is just the reverse-bit order of the inputs. Bit reversal is a harder problem than one might initially realize, but fortunately Stanford hosts a page about just this (see [here](http://graphics.stanford.edu/~seander/bithacks.html)). I used the "Reverse an N bit quantity in parallel n 5 * lg(N) operations" method. 
+
+After the inputs have been reorganized, the sub-DFTs must be computed. Starting at the base case of N=2, the upper half indices are multiplied by their proper twiddle factor. Then the N values add or subract with the element N/2 away. This forms the famous "butterfly" patten, depicted below in an image from Wikipedia. 
+
+![](img/butterfly.png)
+
+These operations are conducted lgN times, each time involving O(N) complex additions/subtractions, as depicted in the flow diagram below.
+
+![](img/correctbutterfly.png)
+Source: Scientific Research Publishing
+
+The time complexity of this algorithm is thus O(NlgN). However, there is plenty of embarrassing parallelism, and thus a parallel implementation should really speed things up over a serial implementation. 
+
+##Performance Analysis
+
+My first optimization, as always was to find the generally optimal blocksize for the working implementation. No particular intermidiate value performed particulary well, so I chose blocksize of 64 as my "optimal" blocksize, against which I compared the CPU implementation. 
+
+![](img/blocksizes.png)
+
+Unfortunately for me (and everyone else hoping for an easy exploit in the embarassingly parallel department) FFTW, an acronym for Fastest Fourier Transform in the West, really lives up to its name. It completely blew away my parallel GPU implementation, even on very large inputs.
+
+![](img/implementations.png)
+
+To be fair to my unhappy little implementation, FFTW is a 100k + line monstrosity of finely tuned computation, and is generally considered the gold standard when it comes to fourier transforms. Some of the optimizations imbued in FFTW are:
+
+* Routines coded in Assembly
+* SIMD Instructions
+* Dynamic Programming techniques to select from multiple strategies for a given input and machine (including memory and cache)
+* Hard Coded Unrolled FFTs for small sizes
+
+
+##Future Work
+There is a lot of room for improvement in the FFT implementation I've done. Among these are:
+
+* Vectorization
+* Shared Memory usage
+* Generalization to non radix-2 
+
+
+##Bloopers
+
+I spent a gratuitous amount of time trying to decode GPU Gems 2's description of the algorithm, especially with regard to the Twiddle factor.
+
+![](img/gpugems.png)
+Source: NVIDIA GPU Gems 2
+
+I could not figure out for the life of me what the relationship was between the stage/index and the exponent of the presumably global Nth root of unity. Fortunately, I eventually stumbled upon this graph, which depicts the proper proceedure and generally makes sense vis a vis the actual Cooley Tukey algorithm.
+
+![](img/correctbutterfly.png)
+Source: Scientific Research Publishing
+
+###Sources
+* GPU GEMS 2
+* [YouTube](https://www.youtube.com/watch?v=EsJGuI7e_ZQ)
diff --git a/cmake/CMakeParseArguments.cmake b/cmake/CMakeParseArguments.cmake
@@ -0,0 +1,161 @@
+#.rst:
+# CMakeParseArguments
+# -------------------
+#
+#
+#
+# CMAKE_PARSE_ARGUMENTS(<prefix> <options> <one_value_keywords>
+# <multi_value_keywords> args...)
+#
+# CMAKE_PARSE_ARGUMENTS() is intended to be used in macros or functions
+# for parsing the arguments given to that macro or function.  It
+# processes the arguments and defines a set of variables which hold the
+# values of the respective options.
+#
+# The <options> argument contains all options for the respective macro,
+# i.e.  keywords which can be used when calling the macro without any
+# value following, like e.g.  the OPTIONAL keyword of the install()
+# command.
+#
+# The <one_value_keywords> argument contains all keywords for this macro
+# which are followed by one value, like e.g.  DESTINATION keyword of the
+# install() command.
+#
+# The <multi_value_keywords> argument contains all keywords for this
+# macro which can be followed by more than one value, like e.g.  the
+# TARGETS or FILES keywords of the install() command.
+#
+# When done, CMAKE_PARSE_ARGUMENTS() will have defined for each of the
+# keywords listed in <options>, <one_value_keywords> and
+# <multi_value_keywords> a variable composed of the given <prefix>
+# followed by "_" and the name of the respective keyword.  These
+# variables will then hold the respective value from the argument list.
+# For the <options> keywords this will be TRUE or FALSE.
+#
+# All remaining arguments are collected in a variable
+# <prefix>_UNPARSED_ARGUMENTS, this can be checked afterwards to see
+# whether your macro was called with unrecognized parameters.
+#
+# As an example here a my_install() macro, which takes similar arguments
+# as the real install() command:
+#
+# ::
+#
+#    function(MY_INSTALL)
+#      set(options OPTIONAL FAST)
+#      set(oneValueArgs DESTINATION RENAME)
+#      set(multiValueArgs TARGETS CONFIGURATIONS)
+#      cmake_parse_arguments(MY_INSTALL "${options}" "${oneValueArgs}"
+#                            "${multiValueArgs}" ${ARGN} )
+#      ...
+#
+#
+#
+# Assume my_install() has been called like this:
+#
+# ::
+#
+#    my_install(TARGETS foo bar DESTINATION bin OPTIONAL blub)
+#
+#
+#
+# After the cmake_parse_arguments() call the macro will have set the
+# following variables:
+#
+# ::
+#
+#    MY_INSTALL_OPTIONAL = TRUE
+#    MY_INSTALL_FAST = FALSE (this option was not used when calling my_install()
+#    MY_INSTALL_DESTINATION = "bin"
+#    MY_INSTALL_RENAME = "" (was not used)
+#    MY_INSTALL_TARGETS = "foo;bar"
+#    MY_INSTALL_CONFIGURATIONS = "" (was not used)
+#    MY_INSTALL_UNPARSED_ARGUMENTS = "blub" (no value expected after "OPTIONAL"
+#
+#
+#
+# You can then continue and process these variables.
+#
+# Keywords terminate lists of values, e.g.  if directly after a
+# one_value_keyword another recognized keyword follows, this is
+# interpreted as the beginning of the new option.  E.g.
+# my_install(TARGETS foo DESTINATION OPTIONAL) would result in
+# MY_INSTALL_DESTINATION set to "OPTIONAL", but MY_INSTALL_DESTINATION
+# would be empty and MY_INSTALL_OPTIONAL would be set to TRUE therefor.
+
+#=============================================================================
+# Copyright 2010 Alexander Neundorf <[email protected]>
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# (To distribute this file outside of CMake, substitute the full
+#  License text for the above reference.)
+
+
+if(__CMAKE_PARSE_ARGUMENTS_INCLUDED)
+  return()
+endif()
+set(__CMAKE_PARSE_ARGUMENTS_INCLUDED TRUE)
+
+
+function(CMAKE_PARSE_ARGUMENTS prefix _optionNames _singleArgNames _multiArgNames)
+  # first set all result variables to empty/FALSE
+  foreach(arg_name ${_singleArgNames} ${_multiArgNames})
+    set(${prefix}_${arg_name})
+  endforeach()
+
+  foreach(option ${_optionNames})
+    set(${prefix}_${option} FALSE)
+  endforeach()
+
+  set(${prefix}_UNPARSED_ARGUMENTS)
+
+  set(insideValues FALSE)
+  set(currentArgName)
+
+  # now iterate over all arguments and fill the result variables
+  foreach(currentArg ${ARGN})
+    list(FIND _optionNames "${currentArg}" optionIndex)  # ... then this marks the end of the arguments belonging to this keyword
+    list(FIND _singleArgNames "${currentArg}" singleArgIndex)  # ... then this marks the end of the arguments belonging to this keyword
+    list(FIND _multiArgNames "${currentArg}" multiArgIndex)  # ... then this marks the end of the arguments belonging to this keyword
+
+    if(${optionIndex} EQUAL -1  AND  ${singleArgIndex} EQUAL -1  AND  ${multiArgIndex} EQUAL -1)
+      if(insideValues)
+        if("${insideValues}" STREQUAL "SINGLE")
+          set(${prefix}_${currentArgName} ${currentArg})
+          set(insideValues FALSE)
+        elseif("${insideValues}" STREQUAL "MULTI")
+          list(APPEND ${prefix}_${currentArgName} ${currentArg})
+        endif()
+      else()
+        list(APPEND ${prefix}_UNPARSED_ARGUMENTS ${currentArg})
+      endif()
+    else()
+      if(NOT ${optionIndex} EQUAL -1)
+        set(${prefix}_${currentArg} TRUE)
+        set(insideValues FALSE)
+      elseif(NOT ${singleArgIndex} EQUAL -1)
+        set(currentArgName ${currentArg})
+        set(${prefix}_${currentArgName})
+        set(insideValues "SINGLE")
+      elseif(NOT ${multiArgIndex} EQUAL -1)
+        set(currentArgName ${currentArg})
+        set(${prefix}_${currentArgName})
+        set(insideValues "MULTI")
+      endif()
+    endif()
+
+  endforeach()
+
+  # propagate the result variables to the caller:
+  foreach(arg_name ${_singleArgNames} ${_multiArgNames} ${_optionNames})
+    set(${prefix}_${arg_name}  ${${prefix}_${arg_name}} PARENT_SCOPE)
+  endforeach()
+  set(${prefix}_UNPARSED_ARGUMENTS ${${prefix}_UNPARSED_ARGUMENTS} PARENT_SCOPE)
+
+endfunction()